feat: add audio transcription pipeline for voice messages
Adds Whisper-compatible audio transcription via a configurable endpoint. New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(), buildUserMessageWithAudio(). Config schema gains an audio section with transcription_endpoint, transcription_api_key, and transcription_model. Daemon wires transcription into the message router. Channel adapters extract audio from voice/audio messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp ptt+audio). Includes 57 media tests (was 25; now covers all audio paths).
This commit is contained in:
@@ -180,6 +180,12 @@ const webSearchSchema = z.object({
|
||||
max_results: z.number().min(1).max(20).default(5),
|
||||
}).default({});
|
||||
|
||||
/**
 * Voice-message transcription settings.
 * The whole section defaults to `{}`, so every key may be omitted from user config.
 */
const audioSchema = z
  .object({
    /** Transcription endpoint URL; optional. */
    transcription_endpoint: z.string().optional(),
    /** API key for the transcription endpoint; optional. */
    transcription_api_key: z.string().optional(),
    /** Transcription model name; falls back to 'whisper-1' when omitted. */
    transcription_model: z.string().default('whisper-1'),
  })
  .default({});
|
||||
|
||||
// ── Tool policy schemas ──────────────────────────────────────────────
|
||||
|
||||
const toolProfileEnum = z.enum(['minimal', 'messaging', 'coding', 'full']);
|
||||
@@ -259,6 +265,7 @@ export const configSchema = z.object({
|
||||
browser: browserSchema,
|
||||
retry: retrySchema,
|
||||
web_search: webSearchSchema,
|
||||
audio: audioSchema,
|
||||
prompt: promptSchema,
|
||||
tools: toolsSchema,
|
||||
sandbox: sandboxSchema,
|
||||
@@ -274,6 +281,7 @@ export type AgentsConfig = z.infer<typeof agentsSchema>;
|
||||
export type CompactionConfig = z.infer<typeof compactionSchema>;
|
||||
export type MemoryConfig = z.infer<typeof memorySchema>;
|
||||
export type WebSearchConfig = z.infer<typeof webSearchSchema>;
|
||||
export type AudioConfig = z.infer<typeof audioSchema>;
|
||||
export type ProcessConfig = z.infer<typeof processSchema>;
|
||||
export type BrowserConfig = z.infer<typeof browserSchema>;
|
||||
export type DiscordConfig = z.infer<typeof discordSchema>;
|
||||
|
||||
+49
-11
@@ -1,11 +1,15 @@
|
||||
import { Lifecycle } from './lifecycle.js';
|
||||
import type { Config, ModelConfig } from '../config/index.js';
|
||||
import type { AudioTranscriptionConfig } from '../models/media.js';
|
||||
import type { Attachment } from '../channels/types.js';
|
||||
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
|
||||
import { AnthropicClient, OpenAIClient, OllamaClient, LlamaCppClient, GeminiClient, BedrockClient, GitHubModelsClient, ModelRouter, DEFAULT_RETRY_CONFIG } from '../models/index.js';
|
||||
import type { ModelClient, RetryConfig } from '../models/index.js';
|
||||
import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
|
||||
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
|
||||
import { SessionStore, SessionManager } from '../session/index.js';
|
||||
import { HookEngine } from '../hooks/index.js';
|
||||
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js';
|
||||
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool } from '../tools/index.js';
|
||||
import type { Tool } from '../tools/types.js';
|
||||
import { MemoryStore } from '../memory/index.js';
|
||||
import { createMemoryTools } from '../tools/builtin/index.js';
|
||||
@@ -204,11 +208,12 @@ function createMessageRouter(deps: {
|
||||
agentConfigRegistry?: AgentConfigRegistry;
|
||||
agentRouter?: AgentRouter;
|
||||
sandboxManager?: SandboxManager;
|
||||
audioConfig?: AudioTranscriptionConfig;
|
||||
}) {
|
||||
// Cache agents by session ID + agent config name to avoid recreating on every message
|
||||
const agents = new Map<string, AgentOrchestrator>();
|
||||
const agents = new Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>();
|
||||
|
||||
function getOrCreateAgent(channel: string, senderId: string): AgentOrchestrator {
|
||||
function getOrCreateAgent(channel: string, senderId: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } {
|
||||
// Resolve agent config name via routing (sender → channel → default fallback)
|
||||
const agentConfigName = deps.agentRouter?.resolve(channel, senderId);
|
||||
const agentConfig = agentConfigName ? deps.agentConfigRegistry?.get(agentConfigName) : undefined;
|
||||
@@ -218,8 +223,8 @@ function createMessageRouter(deps: {
|
||||
? `${channel}:${senderId}:${agentConfigName}`
|
||||
: `${channel}:${senderId}`;
|
||||
|
||||
let agent = agents.get(sessionId);
|
||||
if (!agent) {
|
||||
let entry = agents.get(sessionId);
|
||||
if (!entry) {
|
||||
const session = deps.sessionManager.getSession(channel, senderId);
|
||||
|
||||
// Use agent config overrides where available, falling back to global config
|
||||
@@ -286,7 +291,14 @@ function createMessageRouter(deps: {
|
||||
effectiveToolRegistry.replace(lazySandboxProcess);
|
||||
}
|
||||
|
||||
agent = new AgentOrchestrator({
|
||||
// Create an attachment collector for this agent session
|
||||
const collector = new OutboundAttachmentCollector();
|
||||
|
||||
// Clone the tool registry to register the media.send tool bound to this collector
|
||||
effectiveToolRegistry = effectiveToolRegistry.clone();
|
||||
effectiveToolRegistry.register(createMediaSendTool(collector));
|
||||
|
||||
const orchestrator = new AgentOrchestrator({
|
||||
modelRouter: deps.modelRouter,
|
||||
systemPrompt: effectiveSystemPrompt,
|
||||
session,
|
||||
@@ -307,14 +319,16 @@ function createMessageRouter(deps: {
|
||||
agent: effectiveTier,
|
||||
provider: effectiveProvider,
|
||||
},
|
||||
attachmentCollector: collector,
|
||||
});
|
||||
agents.set(sessionId, agent);
|
||||
entry = { orchestrator, collector };
|
||||
agents.set(sessionId, entry);
|
||||
}
|
||||
return agent;
|
||||
return entry;
|
||||
}
|
||||
|
||||
return async (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>): Promise<void> => {
|
||||
const agent = getOrCreateAgent(msg.channel, msg.senderId);
|
||||
const { orchestrator: agent, collector } = getOrCreateAgent(msg.channel, msg.senderId);
|
||||
|
||||
// Handle special commands
|
||||
if (msg.metadata?.isCommand) {
|
||||
@@ -367,8 +381,24 @@ function createMessageRouter(deps: {
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await agent.process(msg.text, msg.attachments);
|
||||
await reply({ text: response, replyTo: msg.id });
|
||||
// Transcribe audio attachments before processing
|
||||
let messageText = msg.text;
|
||||
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
|
||||
|
||||
if (audioAttachments.length > 0 && deps.audioConfig) {
|
||||
for (const att of audioAttachments) {
|
||||
const transcript = await transcribeAudio(att, deps.audioConfig);
|
||||
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
|
||||
}
|
||||
}
|
||||
|
||||
const response = await agent.process(messageText, msg.attachments);
|
||||
const outboundAttachments = collector.drain();
|
||||
await reply({
|
||||
text: response,
|
||||
replyTo: msg.id,
|
||||
attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error);
|
||||
await reply({
|
||||
@@ -539,6 +569,13 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
|
||||
});
|
||||
}
|
||||
|
||||
// Initialize audio transcription config
|
||||
const audioConfig: AudioTranscriptionConfig = {
|
||||
endpoint: config.audio.transcription_endpoint,
|
||||
apiKey: config.audio.transcription_api_key,
|
||||
model: config.audio.transcription_model,
|
||||
};
|
||||
|
||||
// Initialize model router
|
||||
const modelRouter = createModelRouter(config);
|
||||
|
||||
@@ -593,6 +630,7 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
|
||||
agentConfigRegistry,
|
||||
agentRouter,
|
||||
sandboxManager,
|
||||
audioConfig,
|
||||
}));
|
||||
|
||||
// Register Telegram adapter
|
||||
|
||||
+347
-1
@@ -1,12 +1,18 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
||||
import { vi } from 'vitest';
|
||||
import type { Attachment } from '../channels/types.js';
|
||||
import type { Message } from './types.js';
|
||||
import {
|
||||
isSupportedImage,
|
||||
isSupportedAudio,
|
||||
attachmentToImageSource,
|
||||
buildUserMessage,
|
||||
getMessageText,
|
||||
hasImages,
|
||||
transcribeAudio,
|
||||
buildUserMessageWithAudio,
|
||||
type AudioTranscriptionConfig,
|
||||
mimeToExtension,
|
||||
} from './media.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -34,6 +40,30 @@ const pdfAttachment: Attachment = makeAttachment({
|
||||
filename: 'doc.pdf',
|
||||
});
|
||||
|
||||
const oggAudioAttachment: Attachment = makeAttachment({
|
||||
mimeType: 'audio/ogg',
|
||||
data: 'AAAAAAAAAAAAAAAAAAAA',
|
||||
filename: 'voice.ogg',
|
||||
});
|
||||
|
||||
const mp3AudioAttachment: Attachment = makeAttachment({
|
||||
mimeType: 'audio/mpeg',
|
||||
data: 'AAAAAQAAAAAAAEAAABkAAABTQA=', // Base64 of a short MP3
|
||||
filename: 'audio.mp3',
|
||||
});
|
||||
|
||||
const wavAudioAttachment: Attachment = makeAttachment({
|
||||
mimeType: 'audio/wav',
|
||||
data: 'UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=', // Base64 of a short WAV
|
||||
filename: 'audio.wav',
|
||||
});
|
||||
|
||||
const m4aAudioAttachment: Attachment = makeAttachment({
|
||||
mimeType: 'audio/x-m4a',
|
||||
data: 'AAAAUGV0Zi4xLjAgc291cmNlIGZvciBzdGFydHBvaW50', // Base64 of M4A
|
||||
filename: 'audio.m4a',
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// 1. isSupportedImage
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -259,3 +289,319 @@ describe('hasImages', () => {
|
||||
expect(hasImages(msg)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// 6. isSupportedAudio
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('isSupportedAudio', () => {
  // MIME types the function is expected to accept.
  const supportedTypes = [
    'audio/ogg',
    'audio/mpeg',
    'audio/mp3',
    'audio/wav',
    'audio/webm',
    'audio/mp4',
    'audio/x-m4a',
  ];

  // MIME types the function is expected to reject (other audio formats and non-audio).
  const unsupportedTypes = [
    'audio/flac',
    'audio/aac',
    'audio/wma',
    'application/pdf',
    'image/jpeg',
    'text/plain',
  ];

  /** Runs isSupportedAudio against a fixture attachment carrying the given MIME type. */
  const verdictFor = (mimeType: string): boolean =>
    isSupportedAudio(makeAttachment({ mimeType }));

  it.each(supportedTypes)('returns true for supported type %s', (mime) => {
    expect(verdictFor(mime)).toBe(true);
  });

  it.each(unsupportedTypes)('returns false for unsupported type %s', (mime) => {
    expect(verdictFor(mime)).toBe(false);
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// 7. mimeToExtension
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('mimeToExtension', () => {
  // Table of [MIME type, expected extension]; the titles expand to the same
  // names the individual tests used before.
  it.each([
    ['audio/ogg', 'ogg'],
    ['audio/mpeg', 'mp3'],
    ['audio/wav', 'wav'],
    ['audio/webm', 'webm'],
    ['audio/mp4', 'm4a'],
    ['audio/x-m4a', 'm4a'],
  ])('returns correct extension for %s', (mime, expected) => {
    expect(mimeToExtension(mime)).toBe(expected);
  });

  // Anything outside the table falls back to the generic extension.
  it('returns bin for unknown MIME type', () => {
    expect(mimeToExtension('audio/flac')).toBe('bin');
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// 8. transcribeAudio
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('transcribeAudio', () => {
  const mockTranscript = 'Hello, this is a test transcription';
  const originalFetch = global.fetch;

  /** Returns the options object handed to the most recent mocked fetch call. */
  function lastFetchInit(): RequestInit {
    const calls = vi.mocked(global.fetch).mock.calls;
    expect(calls.length).toBeGreaterThan(0);
    const last = calls[calls.length - 1];
    return (last?.[1] ?? {}) as RequestInit;
  }

  // Replace fetch with a mock for every test so no real request is ever sent.
  beforeEach(() => {
    global.fetch = vi.fn();
  });

  afterEach(() => {
    global.fetch = originalFetch;
  });

  // Positive: transcribes audio with valid config.
  it('transcribes audio successfully with valid config', async () => {
    vi.mocked(global.fetch).mockResolvedValue({
      ok: true,
      json: async () => ({ text: mockTranscript }),
    } as Response);

    const config: AudioTranscriptionConfig = {
      endpoint: 'https://api.example.com/v1/audio/transcriptions',
      apiKey: 'test-key',
      model: 'test-model',
    };

    const result = await transcribeAudio(oggAudioAttachment, config);

    expect(result).toBe(mockTranscript);
    expect(global.fetch).toHaveBeenCalledTimes(1);
    expect(global.fetch).toHaveBeenCalledWith(
      'https://api.example.com/v1/audio/transcriptions',
      expect.objectContaining({
        method: 'POST',
        body: expect.any(FormData),
      }),
    );

    // Verify what was actually sent, not just that a form was sent.
    const init = lastFetchInit();
    expect(init.headers).toMatchObject({ Authorization: 'Bearer test-key' });
    const body = init.body as FormData;
    expect(body.get('model')).toBe('test-model');
    expect(body.get('file')).not.toBeNull();
  });

  // Negative: returns placeholder when endpoint is missing.
  it('returns placeholder message when endpoint is not configured', async () => {
    const result = await transcribeAudio(oggAudioAttachment);

    expect(result).toBe('[Audio message received but no transcription service is configured]');
    // Without an endpoint there is nothing to call.
    expect(global.fetch).not.toHaveBeenCalled();
  });

  // Negative: returns placeholder when API fails.
  it('returns placeholder message when API returns error', async () => {
    vi.mocked(global.fetch).mockResolvedValue({
      ok: false,
      status: 500,
      statusText: 'Internal Server Error',
      text: async () => 'Internal Server Error',
    } as Response);

    const config: AudioTranscriptionConfig = {
      endpoint: 'https://api.example.com/v1/audio/transcriptions',
    };

    const result = await transcribeAudio(oggAudioAttachment, config);

    expect(result).toBe('[Audio message transcription failed]');
  });

  // Negative: handles network errors gracefully.
  it('returns placeholder message on network error', async () => {
    vi.mocked(global.fetch).mockRejectedValue(new Error('Network error'));

    const config: AudioTranscriptionConfig = {
      endpoint: 'https://api.example.com/v1/audio/transcriptions',
    };

    const result = await transcribeAudio(oggAudioAttachment, config);

    expect(result).toBe('[Audio message transcription failed]');
  });

  // Positive: uses Whisper-1 model by default.
  it('uses whisper-1 model by default', async () => {
    const config: AudioTranscriptionConfig = {
      endpoint: 'https://api.openai.com/v1/audio/transcriptions',
    };

    vi.mocked(global.fetch).mockResolvedValue({
      ok: true,
      json: async () => ({ text: 'test' }),
    } as Response);

    await transcribeAudio(oggAudioAttachment, config);

    expect(global.fetch).toHaveBeenCalledWith(
      'https://api.openai.com/v1/audio/transcriptions',
      expect.objectContaining({
        body: expect.any(FormData),
      }),
    );

    // The point of this test: the model field must fall back to whisper-1,
    // and no Authorization header is sent when no API key is configured.
    const init = lastFetchInit();
    expect((init.body as FormData).get('model')).toBe('whisper-1');
    expect(init.headers ?? {}).not.toHaveProperty('Authorization');
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// 9. buildUserMessageWithAudio
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('buildUserMessageWithAudio', () => {
  const textMessage = 'What is 2 + 2?';
  const originalFetch = global.fetch;

  const config: AudioTranscriptionConfig = {
    endpoint: 'https://api.example.com/v1/audio/transcriptions',
  };

  /** Builds a minimal successful transcription response carrying `text`. */
  const okResponse = (text: string): Response =>
    ({
      ok: true,
      json: async () => ({ text }),
    }) as Response;

  // Replace fetch with a mock for every test so no real request is ever sent.
  beforeEach(() => {
    global.fetch = vi.fn();
  });

  afterEach(() => {
    global.fetch = originalFetch;
  });

  // Positive: plain text message when no attachments.
  it('returns plain text message when no attachments', async () => {
    const result = await buildUserMessageWithAudio(textMessage);

    expect(result).toEqual({ role: 'user', content: textMessage });
  });

  // Positive: includes transcription when audio attachment present.
  it('includes transcription when audio attachment is present', async () => {
    vi.mocked(global.fetch).mockResolvedValue(okResponse('The answer is 4'));

    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], config);

    expect(result.role).toBe('user');
    expect(result.content).toContain('[Voice message]:');
    expect(result.content).toContain('The answer is 4');
    expect(result.content).toContain(textMessage);
  });

  // Positive: transcribes multiple audio attachments.
  it('transcribes multiple audio attachments', async () => {
    // Distinct transcripts per request so a dropped or duplicated attachment is detectable.
    vi.mocked(global.fetch)
      .mockResolvedValueOnce(okResponse('First clip'))
      .mockResolvedValueOnce(okResponse('Second clip'));

    const result = await buildUserMessageWithAudio(
      textMessage,
      [oggAudioAttachment, mp3AudioAttachment],
      config,
    );

    expect(global.fetch).toHaveBeenCalledTimes(2);
    expect(typeof result.content).toBe('string');

    const content = result.content as string;
    expect(content).toContain('[Voice message]: First clip');
    expect(content).toContain('[Voice message]: Second clip');
    // Exactly one marker per audio attachment.
    expect(content.split('[Voice message]:')).toHaveLength(3);
    expect(content).toContain(textMessage);
  });

  // Positive: audio transcripts appear before original text.
  it('places audio transcripts before original message text', async () => {
    vi.mocked(global.fetch).mockResolvedValue(okResponse('The answer is 4'));

    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], config);

    const content = Array.isArray(result.content)
      ? result.content
      : [{ type: 'text' as const, text: result.content }];
    const textPart = content.find((p) => p.type === 'text') as
      | { type: 'text'; text: string }
      | undefined;
    expect(textPart).toBeDefined();

    const textContent = textPart?.text ?? '';
    const firstVoiceIndex = textContent.indexOf('[Voice message]:');
    const textIndex = textContent.indexOf(textMessage);

    // Guard against a vacuous pass: -1 would also be "less than" textIndex.
    expect(firstVoiceIndex).toBeGreaterThanOrEqual(0);
    expect(firstVoiceIndex).toBeLessThan(textIndex);
  });

  // Positive: handles mixed image and audio attachments.
  it('handles mixed image and audio attachments', async () => {
    vi.mocked(global.fetch).mockResolvedValue(okResponse('The answer is 4'));

    const result = await buildUserMessageWithAudio(
      textMessage,
      [jpegBase64Attachment, oggAudioAttachment, pngUrlAttachment],
      config,
    );

    expect(result.role).toBe('user');
    expect(Array.isArray(result.content)).toBe(true);

    const parts = result.content as Array<{ type: string; text?: string }>;
    // One text part (transcript + original text) followed by one part per image;
    // the audio attachment itself never becomes a content part.
    expect(parts).toHaveLength(3);
    expect(parts.filter((p) => p.type === 'image')).toHaveLength(2);

    const textPart = parts.find((p) => p.type === 'text');
    expect(textPart?.text).toContain('[Voice message]:');
    expect(textPart?.text).toContain(textMessage);
  });

  // Positive: no transcription when audio config is missing.
  it('returns original message when audio config is missing', async () => {
    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment]);

    expect(result).toEqual({ role: 'user', content: textMessage });
    expect(global.fetch).not.toHaveBeenCalled();
  });

  // Positive: empty text with audio attachments.
  it('handles empty text with audio attachments', async () => {
    vi.mocked(global.fetch).mockResolvedValue(okResponse('Test'));

    const result = await buildUserMessageWithAudio('', [oggAudioAttachment], config);

    expect(result.role).toBe('user');
    expect(result.content).toContain('[Voice message]:');
  });
});
|
||||
|
||||
@@ -13,11 +13,41 @@ const SUPPORTED_IMAGE_TYPES = new Set([
|
||||
'image/webp',
|
||||
]);
|
||||
|
||||
/** Audio MIME types accepted by isSupportedAudio; any other type is treated as unsupported. */
const SUPPORTED_AUDIO_TYPES = new Set<string>([
  // Ogg container
  'audio/ogg',
  // MP3, under both of its commonly seen labels
  'audio/mpeg',
  'audio/mp3',
  // WAV and WebM
  'audio/wav',
  'audio/webm',
  // MPEG-4 audio, under both of its commonly seen labels
  'audio/mp4',
  'audio/x-m4a',
]);
|
||||
|
||||
/** Report whether the attachment's MIME type is in the supported image set. */
export function isSupportedImage(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_IMAGE_TYPES.has(mimeType);
}
|
||||
|
||||
/** Report whether the attachment's MIME type is in the supported audio set. */
export function isSupportedAudio(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_AUDIO_TYPES.has(mimeType);
}
|
||||
|
||||
/**
 * Audio MIME type → file extension.
 * Built once at module load; a Map is used so that inherited Object members
 * (e.g. "toString", "constructor") can never be returned by a lookup.
 */
const AUDIO_EXTENSION_BY_MIME: ReadonlyMap<string, string> = new Map([
  ['audio/ogg', 'ogg'],
  ['audio/mpeg', 'mp3'],
  ['audio/mp3', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/webm', 'webm'],
  ['audio/mp4', 'm4a'],
  ['audio/x-m4a', 'm4a'],
]);

/**
 * Convert MIME type to file extension.
 *
 * The lookup ignores letter case, surrounding whitespace and any parameters
 * after `;`, so `audio/ogg; codecs=opus` resolves the same way as `audio/ogg`.
 *
 * @param mime - MIME type string, optionally carrying parameters.
 * @returns The extension without a leading dot, or `'bin'` for unknown types.
 */
export function mimeToExtension(mime: string): string {
  const paramStart = mime.indexOf(';');
  const bareType = (paramStart === -1 ? mime : mime.slice(0, paramStart)).trim().toLowerCase();
  return AUDIO_EXTENSION_BY_MIME.get(bareType) ?? 'bin';
}
|
||||
|
||||
/** Convert a channel Attachment to a model ImageSource. Prefers base64 data, falls back to URL. */
|
||||
export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
|
||||
if (!isSupportedImage(attachment)) {
|
||||
@@ -90,6 +120,103 @@ export function getMessageText(message: Message): string {
|
||||
.join('');
|
||||
}
|
||||
|
||||
/** Configuration for audio transcription via Whisper-compatible API. */
export interface AudioTranscriptionConfig {
  /**
   * Whisper-compatible API endpoint (e.g. "https://api.openai.com/v1/audio/transcriptions").
   * When unset, no request is made and a placeholder string is returned instead.
   */
  endpoint?: string;
  /** API key for the transcription service; sent as a Bearer token when present. */
  apiKey?: string;
  /** Model name (default: "whisper-1") */
  model?: string;
}

/**
 * Transcribe an audio attachment to text using the OpenAI-compatible /v1/audio/transcriptions API.
 *
 * This function never rejects. Every failure mode resolves to a bracketed
 * placeholder string so callers can embed the result in a message unconditionally:
 * - no endpoint configured → "[Audio message received but no transcription service is configured]"
 * - missing audio data, network error, non-2xx response, or a response without
 *   a string `text` field → "[Audio message transcription failed]" (the cause is logged)
 *
 * @param attachment - Audio attachment; its `data` is decoded as base64.
 * @param config - Endpoint, optional API key and optional model name.
 * @returns The transcript text, or one of the placeholder strings above.
 */
export async function transcribeAudio(
  attachment: Attachment,
  config?: AudioTranscriptionConfig,
): Promise<string> {
  if (!config?.endpoint) {
    return '[Audio message received but no transcription service is configured]';
  }

  try {
    // Attachments may arrive without inline data; fail with a clear cause
    // instead of relying on Buffer.from throwing on undefined.
    if (!attachment.data) {
      throw new Error('Attachment has no inline audio data');
    }

    const audioBuffer = Buffer.from(attachment.data, 'base64');
    const ext = mimeToExtension(attachment.mimeType);
    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`);
    formData.append('model', config.model ?? 'whisper-1');

    // Content-Type is deliberately not set here; only the optional auth header is added.
    const headers: Record<string, string> = {};
    if (config.apiKey) {
      headers['Authorization'] = `Bearer ${config.apiKey}`;
    }

    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
    if (!res.ok) {
      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
    }

    // Validate the payload instead of trusting a cast: a response without a
    // string `text` would otherwise leak `undefined` out of a Promise<string>.
    const payload: unknown = await res.json();
    const text =
      typeof payload === 'object' && payload !== null
        ? (payload as { text?: unknown }).text
        : undefined;
    if (typeof text !== 'string') {
      throw new Error('Transcription response did not contain a text field');
    }
    return text;
  } catch (error: unknown) {
    console.error(
      `Failed to transcribe audio (${attachment.mimeType}):`,
      error instanceof Error ? error.message : 'Unknown error',
    );
    return '[Audio message transcription failed]';
  }
}
|
||||
|
||||
/**
 * Build a multimodal Message from text + attachments, with optional audio transcription.
 *
 * - Audio attachments are transcribed only when `audioConfig.endpoint` is set.
 *   Each transcript becomes a "[Voice message]: …" paragraph placed before the
 *   original text, in the same order as the attachments.
 * - Image attachments are converted to image content parts.
 * - Attachments that are neither supported audio nor supported images are ignored.
 *
 * @param text - The user's message text (may be empty).
 * @param attachments - Attachments received with the message, if any.
 * @param audioConfig - Transcription settings; without an endpoint, audio is left untranscribed.
 * @returns A user message whose content is a plain string when there are no
 *   image parts, otherwise an array with the text part (if non-empty) followed by the images.
 */
export async function buildUserMessageWithAudio(
  text: string,
  attachments?: Attachment[],
  audioConfig?: AudioTranscriptionConfig,
): Promise<Message> {
  const received = attachments ?? [];

  // Transcribe audio attachments (only if an endpoint is configured).
  let processedText = text;
  if (audioConfig?.endpoint) {
    const audioAttachments = received.filter((a) => isSupportedAudio(a));
    // Promise.all keeps results in attachment order, so voice messages read
    // in the order they were sent rather than reversed by repeated prepending.
    const transcripts = await Promise.all(
      audioAttachments.map((a) => transcribeAudio(a, audioConfig)),
    );
    const voicePrefix = transcripts
      .map((transcript) => `[Voice message]: ${transcript}\n\n`)
      .join('');
    processedText = `${voicePrefix}${text}`;
  }

  // Convert image attachments to content parts, skipping any that yield no source.
  const imageParts: MessageContentPart[] = [];
  for (const att of received) {
    if (!isSupportedImage(att)) {
      continue;
    }
    const source = attachmentToImageSource(att);
    if (source) {
      imageParts.push({ type: 'image', source });
    }
  }

  // No images — the (possibly transcript-prefixed) text is the whole message.
  if (imageParts.length === 0) {
    return { role: 'user', content: processedText };
  }

  // Build multimodal content: text first (when non-empty), then images.
  const parts: MessageContentPart[] = [];
  if (processedText) {
    parts.push({ type: 'text', text: processedText });
  }
  parts.push(...imageParts);

  return { role: 'user', content: parts };
}
|
||||
|
||||
/**
|
||||
* Check whether a message contains image content parts.
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user