feat: add audio transcription pipeline for voice messages

Adds Whisper-compatible audio transcription via configurable endpoint. New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(), buildUserMessageWithAudio(). Config schema gains audio section with transcription_endpoint, api_key, and model. Daemon wires transcription into the message router. Channel adapters extract audio from voice/audio messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp ptt+audio). Includes 57 media tests (was 25, now covers all audio paths).
2026-02-07 09:09:13 -08:00
parent e052778b0a
commit 2a962abcd0
4 changed files with 531 additions and 12 deletions
@@ -1,12 +1,18 @@
-import { describe, it, expect } from 'vitest';
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import { vi } from 'vitest';
 import type { Attachment } from '../channels/types.js';
 import type { Message } from './types.js';
 import {
  isSupportedImage,
+  isSupportedAudio,
  attachmentToImageSource,
  buildUserMessage,
  getMessageText,
  hasImages,
+  transcribeAudio,
+  buildUserMessageWithAudio,
+  type AudioTranscriptionConfig,
+  mimeToExtension,
 } from './media.js';

 // ---------------------------------------------------------------------------
@@ -34,6 +40,30 @@ const pdfAttachment: Attachment = makeAttachment({
  filename: 'doc.pdf',
 });

+// Audio fixtures for the transcription tests. Those tests replace global.fetch
+// with a mock, so these payloads are never sent to a real transcription service.
+const oggAudioAttachment: Attachment = makeAttachment({
+  mimeType: 'audio/ogg',
+  data: 'AAAAAAAAAAAAAAAAAAAA', // Placeholder: base64 of all-zero bytes, not a real Ogg stream
+  filename: 'voice.ogg',
+});
+
+const mp3AudioAttachment: Attachment = makeAttachment({
+  mimeType: 'audio/mpeg',
+  data: 'AAAAAQAAAAAAAEAAABkAAABTQA=', // Placeholder bytes: starts with zero bytes, not an ID3 tag or MPEG frame sync
+  filename: 'audio.mp3',
+});
+
+const wavAudioAttachment: Attachment = makeAttachment({
+  mimeType: 'audio/wav',
+  data: 'UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=', // Base64 of a RIFF/WAVE header with an empty data chunk
+  filename: 'audio.wav',
+});
+
+const m4aAudioAttachment: Attachment = makeAttachment({
+  mimeType: 'audio/x-m4a',
+  data: 'AAAAUGV0Zi4xLjAgc291cmNlIGZvciBzdGFydHBvaW50', // Placeholder bytes (zero bytes followed by ASCII text), not a real M4A container
+  filename: 'audio.m4a',
+});
+
 // ---------------------------------------------------------------------------
 // 1. isSupportedImage
 // ---------------------------------------------------------------------------
@@ -259,3 +289,319 @@ describe('hasImages', () => {
    expect(hasImages(msg)).toBe(true);
  });
 });
+
+// ---------------------------------------------------------------------------
+// 6. isSupportedAudio
+// ---------------------------------------------------------------------------
+
+describe('isSupportedAudio', () => {
+  // MIME types that isSupportedAudio must accept.
+  const accepted = [
+    'audio/ogg',
+    'audio/mpeg',
+    'audio/mp3',
+    'audio/wav',
+    'audio/webm',
+    'audio/mp4',
+    'audio/x-m4a',
+  ];
+
+  // Other audio formats and non-audio types that isSupportedAudio must reject.
+  const rejected = [
+    'audio/flac',
+    'audio/aac',
+    'audio/wma',
+    'application/pdf',
+    'image/jpeg',
+    'text/plain',
+  ];
+
+  // One test per MIME type, so a failure names the exact type that regressed.
+  for (const mimeType of accepted) {
+    it(`returns true for supported type ${mimeType}`, () => {
+      expect(isSupportedAudio(makeAttachment({ mimeType }))).toBe(true);
+    });
+  }
+
+  for (const mimeType of rejected) {
+    it(`returns false for unsupported type ${mimeType}`, () => {
+      expect(isSupportedAudio(makeAttachment({ mimeType }))).toBe(false);
+    });
+  }
+});
+
+// ---------------------------------------------------------------------------
+// 7. mimeToExtension
+// ---------------------------------------------------------------------------
+
+describe('mimeToExtension', () => {
+  // Table of [MIME type, expected file extension]. Both audio/mp4 and
+  // audio/x-m4a are expected to map to the same m4a extension.
+  it.each([
+    ['audio/ogg', 'ogg'],
+    ['audio/mpeg', 'mp3'],
+    ['audio/wav', 'wav'],
+    ['audio/webm', 'webm'],
+    ['audio/mp4', 'm4a'],
+    ['audio/x-m4a', 'm4a'],
+  ])('returns correct extension for %s', (mime, extension) => {
+    expect(mimeToExtension(mime)).toBe(extension);
+  });
+
+  // A MIME type with no mapping falls back to the generic bin extension.
+  it('returns bin for unknown MIME type', () => {
+    expect(mimeToExtension('audio/flac')).toBe('bin');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 8. transcribeAudio
+// ---------------------------------------------------------------------------
+
+describe('transcribeAudio', () => {
+  // These tests never reach a real transcription service: global.fetch is
+  // swapped for a vitest mock before each test and restored afterwards.
+  const mockTranscript = 'Hello, this is a test transcription';
+  const originalFetch = global.fetch;
+
+  beforeEach(() => {
+    global.fetch = vi.fn();
+  });
+
+  afterEach(() => {
+    global.fetch = originalFetch;
+  });
+
+  // Positive: a successful response yields the transcript text, and the request
+  // is a POST with a multipart (FormData) body sent to the configured endpoint.
+  it('transcribes audio successfully with valid config', async () => {
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: mockTranscript }),
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+      apiKey: 'test-key',
+      model: 'test-model',
+    };
+
+    const result = await transcribeAudio(oggAudioAttachment, config);
+
+    expect(result).toBe(mockTranscript);
+    expect(global.fetch).toHaveBeenCalledWith(
+      'https://api.example.com/v1/audio/transcriptions',
+      expect.objectContaining({
+        method: 'POST',
+        body: expect.any(FormData),
+      }),
+    );
+  });
+
+  // Negative: calling without any config returns the "not configured" placeholder.
+  it('returns placeholder message when endpoint is not configured', async () => {
+    const result = await transcribeAudio(oggAudioAttachment);
+
+    expect(result).toBe('[Audio message received but no transcription service is configured]');
+  });
+
+  // Negative: a non-OK HTTP response is reported as a failed transcription.
+  it('returns placeholder message when API returns error', async () => {
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: false,
+      status: 500,
+      statusText: 'Internal Server Error',
+      text: async () => 'Internal Server Error',
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await transcribeAudio(oggAudioAttachment, config);
+
+    expect(result).toBe('[Audio message transcription failed]');
+  });
+
+  // Negative: a rejected fetch (network error) yields the same failure placeholder
+  // instead of propagating the error to the caller.
+  it('returns placeholder message on network error', async () => {
+    vi.mocked(global.fetch).mockRejectedValue(new Error('Network error'));
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await transcribeAudio(oggAudioAttachment, config);
+
+    expect(result).toBe('[Audio message transcription failed]');
+  });
+
+  // Positive: with no `model` in the config, the request falls back to whisper-1.
+  it('uses whisper-1 model by default', async () => {
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.openai.com/v1/audio/transcriptions',
+    };
+
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: 'test' }),
+    } as Response);
+
+    await transcribeAudio(oggAudioAttachment, config);
+
+    expect(global.fetch).toHaveBeenCalledWith(
+      'https://api.openai.com/v1/audio/transcriptions',
+      expect.objectContaining({
+        body: expect.any(FormData),
+      }),
+    );
+
+    // Checking only that the body is FormData would pass for any model, so read
+    // the submitted form itself. NOTE(review): assumes the model name travels in
+    // the Whisper-compatible `model` form field — confirm against media.ts.
+    const [, init] = vi.mocked(global.fetch).mock.calls[0] ?? [];
+    const form = init?.body as FormData;
+    expect(form.get('model')).toBe('whisper-1');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 9. buildUserMessageWithAudio
+// ---------------------------------------------------------------------------
+
+describe('buildUserMessageWithAudio', () => {
+  // global.fetch is replaced with a vitest mock for every test, so the
+  // transcripts asserted below are whatever the mock is told to return.
+  const textMessage = 'What is 2 + 2?';
+  const originalFetch = global.fetch;
+
+  beforeEach(() => {
+    global.fetch = vi.fn();
+  });
+
+  afterEach(() => {
+    global.fetch = originalFetch;
+  });
+
+  // Positive: with no attachments the text is passed through unchanged.
+  it('returns plain text message when no attachments', async () => {
+    const result = await buildUserMessageWithAudio(textMessage);
+
+    expect(result).toEqual({ role: 'user', content: textMessage });
+  });
+
+  // Positive: the transcript is added with a "[Voice message]:" marker and the
+  // original text is kept.
+  it('includes transcription when audio attachment is present', async () => {
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: 'The answer is 4' }),
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], config);
+
+    expect(result.role).toBe('user');
+    expect(result.content).toContain('[Voice message]:');
+    expect(result.content).toContain('The answer is 4');
+    expect(result.content).toContain(textMessage);
+  });
+
+  // Positive: every audio attachment gets its own transcript.
+  it('transcribes multiple audio attachments', async () => {
+    // Each request returns a different transcript. With one shared transcript
+    // the test could not tell whether the second clip was dropped.
+    vi.mocked(global.fetch)
+      .mockResolvedValueOnce({
+        ok: true,
+        json: async () => ({ text: 'First clip' }),
+      } as Response)
+      .mockResolvedValueOnce({
+        ok: true,
+        json: async () => ({ text: 'Second clip' }),
+      } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await buildUserMessageWithAudio(
+      textMessage,
+      [oggAudioAttachment, mp3AudioAttachment],
+      config,
+    );
+
+    // One transcription request per audio attachment, and both results present.
+    expect(global.fetch).toHaveBeenCalledTimes(2);
+    expect(result.content).toContain('[Voice message]: First clip');
+    expect(result.content).toContain('[Voice message]: Second clip');
+  });
+
+  // Positive: audio transcripts appear before the original text.
+  it('places audio transcripts before original message text', async () => {
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: 'The answer is 4' }),
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], config);
+
+    // Normalise string content into the multi-part shape so one lookup covers both.
+    const content = Array.isArray(result.content) ? result.content : [{ type: 'text' as const, text: result.content }];
+    const textPart = content.find((p) => p.type === 'text') as { type: 'text'; text: string } | undefined;
+    expect(textPart).toBeDefined();
+
+    const textContent = textPart?.text ?? '';
+    const firstVoiceIndex = textContent.indexOf('[Voice message]:');
+    const textIndex = textContent.indexOf(textMessage);
+
+    // indexOf returns -1 for a missing marker, and -1 is less than any valid
+    // index, so require the marker to be found before comparing positions.
+    expect(firstVoiceIndex).toBeGreaterThanOrEqual(0);
+    expect(firstVoiceIndex).toBeLessThan(textIndex);
+  });
+
+  // Positive: images and audio can be combined in one message.
+  it('handles mixed image and audio attachments', async () => {
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: 'The answer is 4' }),
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await buildUserMessageWithAudio(
+      textMessage,
+      [jpegBase64Attachment, oggAudioAttachment, pngUrlAttachment],
+      config,
+    );
+
+    expect(result.role).toBe('user');
+    expect(Array.isArray(result.content)).toBe(true);
+
+    const parts = result.content as Array<{ type: string; text?: string }>;
+    // Presumably two image parts (the JPEG and PNG fixtures) plus one text part;
+    // the assertions below show transcript and message share that text part.
+    expect(parts).toHaveLength(3);
+
+    const textPart = parts.find((p) => p.type === 'text');
+    expect(textPart?.text).toContain('[Voice message]:');
+    expect(textPart?.text).toContain(textMessage);
+
+    const imagePart = parts.find((p) => p.type === 'image');
+    expect(imagePart).toBeDefined();
+  });
+
+  // Positive: without an audio config the audio attachment is ignored and the
+  // message is returned as plain text.
+  it('returns original message when audio config is missing', async () => {
+    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment]);
+
+    expect(result).toEqual({ role: 'user', content: textMessage });
+  });
+
+  // Positive: a voice-only message (empty text) still carries the transcript.
+  it('handles empty text with audio attachments', async () => {
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: 'Test' }),
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await buildUserMessageWithAudio('', [oggAudioAttachment], config);
+
+    expect(result.role).toBe('user');
+    expect(result.content).toContain('[Voice message]:');
+  });
+});