diff --git a/docs/plans/state.json b/docs/plans/state.json index 1cf8a2c..d1cdb9b 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -1089,7 +1089,7 @@ }, "overall_progress": { - "total_test_count": 1331, + "total_test_count": 1369, "all_tests_passing": true, "p0_completion": "3/3 (100%)", "p1_completion": "4/4 (100%)", @@ -1107,6 +1107,7 @@ "feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)", "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next", "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram", + "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback", "next_up": "End-to-end test that Flynn follows through on tool calls via GitHub Copilot fallback. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items" }, "soul_md_and_cron_create": { @@ -1137,6 +1138,56 @@ "src/backends/native/agent.test.ts" ] }, + "native-audio-support": { + "status": "completed", + "date": "2026-02-11", + "summary": "Native audio input support — voice messages passed directly to audio-capable models (Gemini, OpenAI, GitHub) instead of always transcribing via Whisper. 
Smart routing decides per-model whether to pass raw audio or transcribe first.", + "phases": { + "audio_transcribe_tool": { + "status": "completed", + "description": "audio.transcribe tool with Whisper-compatible API support", + "files_created": [ + "src/tools/builtin/audio-transcribe.ts" + ] + }, + "type_system_and_clients": { + "status": "completed", + "description": "AudioSource type, audio content part handling in all model clients (Gemini inlineData, OpenAI input_audio, GitHub input_audio = native; Anthropic, Bedrock = text fallback)", + "files_modified": [ + "src/models/types.ts", + "src/models/gemini.ts", + "src/models/openai.ts", + "src/models/github.ts", + "src/models/anthropic.ts", + "src/models/bedrock.ts", + "src/models/media.ts" + ] + }, + "capabilities_and_routing": { + "status": "completed", + "description": "supportsAudioInput() capability check, smart routing in daemon that transcribes for non-audio models and passes raw audio for capable ones, supports_audio config override", + "files_created": [ + "src/models/capabilities.ts", + "src/models/capabilities.test.ts" + ], + "files_modified": [ + "src/daemon/routing.ts", + "src/config/schema.ts" + ], + "test_status": "18/18 passing" + }, + "tests_and_token_estimation": { + "status": "completed", + "description": "Audio tests for media helpers, audio token estimation (base64→bytes→duration→tokens at 32 tokens/sec), supports_audio config override wiring", + "files_modified": [ + "src/models/media.test.ts", + "src/context/tokens.ts", + "src/context/tokens.test.ts" + ], + "test_status": "20/20 tokens tests, 87/87 media tests" + } + } + }, "stopreason-normalization": { "date": "2026-02-11", "summary": "Normalize OpenAI/GitHub finish_reason to Flynn stopReason conventions. OpenAI 'stop' → 'end_turn', 'length' → 'max_tokens', 'tool_calls' with tools → 'tool_use', 'tool_calls' without tools → 'end_turn'. Fixes premature agent loop exit when falling back to GitHub Copilot (Anthropic API quota exhausted). 
Agent loop now accepts both 'tool_use' and 'tool_calls' as belt-and-suspenders.", diff --git a/src/config/schema.ts b/src/config/schema.ts index a043e62..0b57612 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -52,6 +52,7 @@ const modelConfigBaseSchema = z.object({ for: z.array(z.string()).optional(), num_gpu: z.number().optional(), context_window: z.number().optional(), + supports_audio: z.boolean().optional(), }); const modelConfigSchema = modelConfigBaseSchema.extend({ @@ -314,10 +315,16 @@ const webSearchSchema = z.object({ max_results: z.number().min(1).max(20).default(5), }).default({}); +const audioProviderSchema = z.object({ + type: z.enum(['openai', 'groq', 'ollama', 'llamacpp', 'custom']), + endpoint: z.string().optional(), + api_key: z.string().optional(), + model: z.string().optional(), +}); + const audioSchema = z.object({ - transcription_endpoint: z.string().optional(), - transcription_api_key: z.string().optional(), - transcription_model: z.string().default('whisper-1'), + enabled: z.boolean().default(false), + provider: audioProviderSchema.optional(), }).default({}); // ── Tool policy schemas ────────────────────────────────────────────── diff --git a/src/context/tokens.test.ts b/src/context/tokens.test.ts index 0f1ae6e..d68a110 100644 --- a/src/context/tokens.test.ts +++ b/src/context/tokens.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { estimateTokens, estimateMessageTokens, getContextWindow, shouldCompact, CONTEXT_WINDOWS } from './tokens.js'; +import { estimateTokens, estimateAudioTokens, estimateMessageTokens, getContextWindow, shouldCompact, CONTEXT_WINDOWS } from './tokens.js'; describe('estimateTokens', () => { it('returns 0 for empty string', () => { @@ -20,6 +20,33 @@ describe('estimateTokens', () => { }); }); +describe('estimateAudioTokens', () => { + it('returns positive number for valid audio data', () => { + // 10000 base64 chars → ~7500 bytes → ~3.75s → ceil(3.75 * 32) = 120 + const 
source = { media_type: 'audio/ogg', data: 'A'.repeat(10000) }; + const tokens = estimateAudioTokens(source); + expect(tokens).toBeGreaterThan(0); + expect(tokens).toBe(120); + }); + + it('returns at least 1 for very short audio', () => { + // 1 byte of base64 data → very tiny duration, but minimum is 1 + const source = { media_type: 'audio/ogg', data: 'A' }; + expect(estimateAudioTokens(source)).toBe(1); + }); + + it('returns 0 for empty audio data', () => { + const source = { media_type: 'audio/ogg', data: '' }; + expect(estimateAudioTokens(source)).toBe(0); + }); + + it('longer audio data produces more tokens', () => { + const short = { media_type: 'audio/ogg', data: 'A'.repeat(1000) }; + const long = { media_type: 'audio/ogg', data: 'A'.repeat(100000) }; + expect(estimateAudioTokens(long)).toBeGreaterThan(estimateAudioTokens(short)); + }); +}); + describe('estimateMessageTokens', () => { it('returns 0 for empty array', () => { expect(estimateMessageTokens([])).toBe(0); @@ -38,6 +65,23 @@ describe('estimateMessageTokens', () => { ]; expect(estimateMessageTokens(messages)).toBe(10); }); + + it('includes audio token estimate for multimodal messages', () => { + // Text part: 'hello' = 5 chars → ceil(5/4) = 2 text tokens + // Audio part: 10000 base64 chars → 120 audio tokens (see estimateAudioTokens test) + // Overhead: 4 + // Total: 2 + 120 + 4 = 126 + const messages = [ + { + role: 'user' as const, + content: [ + { type: 'text' as const, text: 'hello' }, + { type: 'audio' as const, source: { media_type: 'audio/ogg', data: 'A'.repeat(10000) } }, + ], + }, + ]; + expect(estimateMessageTokens(messages)).toBe(126); + }); }); describe('getContextWindow', () => { diff --git a/src/context/tokens.ts b/src/context/tokens.ts index b3358a6..06e6a55 100644 --- a/src/context/tokens.ts +++ b/src/context/tokens.ts @@ -1,4 +1,4 @@ -import type { Message } from '../models/types.js'; +import type { Message, AudioSource } from '../models/types.js'; import { getMessageText } from 
'../models/media.js'; /** @@ -36,6 +36,25 @@ export function estimateTokens(text: string): number { return Math.ceil(text.length / 4); } +/** + * Estimate token count for an audio content part. + * + * Heuristic: + * 1. Decode base64 length to bytes: `base64Length * 0.75` + * 2. Assume ~16 kbps bitrate (typical voice OGG/Opus): `bytes / 2000` → seconds + * 3. Estimate ~32 tokens per second of audio (Gemini-style rate) + * + * Returns at least 1 token for any non-empty audio data. + */ +export function estimateAudioTokens(audioSource: AudioSource): number { + const base64Length = audioSource.data.length; + if (base64Length === 0) { + return 0; + } + const durationSeconds = (base64Length * 0.75) / 2000; + return Math.max(1, Math.ceil(durationSeconds * 32)); +} + /** * Estimate the total token count for an array of messages. * @@ -43,10 +62,20 @@ export function estimateTokens(text: string): number { * overhead of ~4 tokens to account for the role marker and separators. */ export function estimateMessageTokens(messages: Message[]): number { - return messages.reduce( - (sum, msg) => sum + estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS, - 0, - ); + return messages.reduce((sum, msg) => { + let tokens = estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS; + + // Add audio token estimates for multimodal messages + if (Array.isArray(msg.content)) { + for (const part of msg.content) { + if (part.type === 'audio') { + tokens += estimateAudioTokens(part.source); + } + } + } + + return sum + tokens; + }, 0); } /** diff --git a/src/daemon/index.ts b/src/daemon/index.ts index 8737b06..8d40cbb 100644 --- a/src/daemon/index.ts +++ b/src/daemon/index.ts @@ -5,7 +5,6 @@ import { mkdirSync } from 'fs'; // ── Config & Types ── import type { Config } from '../config/index.js'; -import type { AudioTranscriptionConfig } from '../models/media.js'; import type { ToolRegistry, ToolExecutor, BrowserManager } from '../tools/index.js'; import type { AgentConfigRegistry, 
AgentRouter } from '../agents/index.js'; import type { SandboxManager } from '../sandbox/index.js'; @@ -100,12 +99,6 @@ export async function startDaemon(config: Config): Promise { const { skillRegistry, skillInstaller } = initSkills(config); const { agentConfigRegistry, agentRouter, sandboxManager } = await initAgents({ config, lifecycle }); - // ── Model & Prompt ── - const audioConfig: AudioTranscriptionConfig = { - endpoint: config.audio.transcription_endpoint, - apiKey: config.audio.transcription_api_key, - model: config.audio.transcription_model, - }; const modelRouter = createModelRouter(config); // Restore persisted model tier @@ -133,7 +126,7 @@ export async function startDaemon(config: Config): Promise { const messageRouter = createMessageRouter({ sessionManager, modelRouter, systemPrompt, toolRegistry, toolExecutor, - config, memoryStore, agentConfigRegistry, agentRouter, sandboxManager, audioConfig, + config, memoryStore, agentConfigRegistry, agentRouter, sandboxManager, }); channelRegistry.setMessageHandler(messageRouter.handler); channelAgents = messageRouter.agents; diff --git a/src/daemon/routing.ts b/src/daemon/routing.ts index 657d694..fafa98e 100644 --- a/src/daemon/routing.ts +++ b/src/daemon/routing.ts @@ -1,6 +1,7 @@ import type { AudioTranscriptionConfig } from '../models/media.js'; import type { Attachment } from '../channels/types.js'; import { isSupportedAudio, transcribeAudio } from '../models/media.js'; +import { supportsAudioInput } from '../models/capabilities.js'; import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js'; import { OutboundAttachmentCollector } from '../backends/native/attachments.js'; import type { InboundMessage, OutboundMessage } from '../channels/index.js'; @@ -32,7 +33,6 @@ export function createMessageRouter(deps: { agentConfigRegistry?: AgentConfigRegistry; agentRouter?: AgentRouter; sandboxManager?: SandboxManager; - audioConfig?: AudioTranscriptionConfig; }): { handler: (msg: 
InboundMessage, reply: (response: OutboundMessage) => Promise) => Promise; agents: Map; @@ -213,18 +213,56 @@ export function createMessageRouter(deps: { } try { - // Transcribe audio attachments before processing - let messageText = msg.text; - const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a)); - - if (audioAttachments.length > 0 && deps.audioConfig) { - for (const att of audioAttachments) { - const transcript = await transcribeAudio(att, deps.audioConfig); - messageText = `[Voice message]: ${transcript}\n\n${messageText}`; + // Determine if the active model supports native audio input + let effectiveTier: string = deps.config.agents.primary_tier ?? 'default'; + if (msg.metadata?.modelTier) { + effectiveTier = msg.metadata.modelTier as string; + } else if (deps.agentRouter && deps.agentConfigRegistry) { + const agentName = deps.agentRouter.resolve(msg.channel, msg.senderId); + if (agentName) { + const agentCfg = deps.agentConfigRegistry.get(agentName); + if (agentCfg?.modelTier) { + effectiveTier = agentCfg.modelTier; + } } } - const response = await agent.process(messageText, msg.attachments); + // Look up provider/model for the effective tier + const modelsConfig = deps.config.models as Record; + const tierConfig = modelsConfig[effectiveTier] ?? deps.config.models.default; + const modelProvider = tierConfig?.provider ?? deps.config.models.default.provider; + const modelName = tierConfig?.model ?? deps.config.models.default.model; + const supportsAudioOverride = (tierConfig as Record | undefined)?.supports_audio as boolean | undefined; + const nativeAudioSupported = supportsAudioInput(modelProvider, modelName, supportsAudioOverride); + + let messageText = msg.text; + let attachments = msg.attachments; + const audioAttachments = (msg.attachments ?? 
[]).filter((a: Attachment) => isSupportedAudio(a)); + + if (audioAttachments.length > 0 && !nativeAudioSupported) { + // Model doesn't support native audio — transcribe via Whisper and strip audio attachments + const audioConfig: AudioTranscriptionConfig | undefined = deps.config.audio?.enabled && deps.config.audio.provider + ? { + endpoint: deps.config.audio.provider.endpoint, + apiKey: deps.config.audio.provider.api_key, + model: deps.config.audio.provider.model, + } + : undefined; + + if (audioConfig?.endpoint) { + for (const att of audioAttachments) { + const transcript = await transcribeAudio(att, audioConfig); + messageText = `[Voice message]: ${transcript}\n\n${messageText}`; + } + } + // Remove audio attachments so buildUserMessage doesn't create audio content parts + attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a)); + if (attachments.length === 0) { attachments = undefined; } + } + // If native audio IS supported, we pass attachments through unchanged — + // buildUserMessage() in the agent will create native audio content parts + + const response = await agent.process(messageText, attachments); const outboundAttachments = collector.drain(); await reply({ text: response, diff --git a/src/daemon/tools.ts b/src/daemon/tools.ts index 0c55d9a..ab81304 100644 --- a/src/daemon/tools.ts +++ b/src/daemon/tools.ts @@ -1,7 +1,8 @@ import type { Config } from '../config/index.js'; import type { Lifecycle } from './lifecycle.js'; +import type { AudioTranscriptionConfig } from '../models/media.js'; import { HookEngine } from '../hooks/index.js'; -import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js'; +import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createAudioTranscribeTool } from 
'../tools/index.js'; export interface ToolsDeps { config: Config; @@ -52,6 +53,18 @@ export function initTools(deps: ToolsDeps): ToolsResult { console.log('Process manager stopped'); }); + // Register audio transcription tool if configured + if (config.audio?.enabled && config.audio.provider) { + const audioConfig: AudioTranscriptionConfig = { + endpoint: config.audio.provider.endpoint, + apiKey: config.audio.provider.api_key, + model: config.audio.provider.model, + }; + const audioTool = createAudioTranscribeTool(audioConfig); + toolRegistry.register(audioTool); + console.log(`Audio transcription enabled (type=${config.audio.provider.type}, endpoint=${audioConfig.endpoint})`); + } + // Initialize browser manager and register browser tools (if enabled) let browserManager: BrowserManager | undefined; if (config.browser?.enabled) { diff --git a/src/models/anthropic.ts b/src/models/anthropic.ts index 47a5356..a35a324 100644 --- a/src/models/anthropic.ts +++ b/src/models/anthropic.ts @@ -41,6 +41,13 @@ function toAnthropicContent(content: string | MessageContentPart[]): string | un }, }; } + // Audio — Anthropic doesn't support native audio input; use transcript fallback + if (part.type === 'audio') { + if (part.source.transcript) { + return { type: 'text', text: `[Voice message]: ${part.source.transcript}` }; + } + return { type: 'text', text: '[Audio message received but no transcript available]' }; + } return part; }); } diff --git a/src/models/bedrock.ts b/src/models/bedrock.ts index a15825b..c2e0a26 100644 --- a/src/models/bedrock.ts +++ b/src/models/bedrock.ts @@ -170,17 +170,27 @@ function convertMessages(messages: Message[]): BedrockMessage[] { if (part.type === 'text') { return { text: part.text } as ContentBlock; } - // Image part — Bedrock uses { image: { format, source: { bytes } } } - if (part.source.type === 'base64' && part.source.data) { - return { - image: { - format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp', - source: { 
bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) }, - }, - } as unknown as ContentBlock; + if (part.type === 'image') { + // Image part — Bedrock uses { image: { format, source: { bytes } } } + if (part.source.type === 'base64' && part.source.data) { + return { + image: { + format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp', + source: { bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) }, + }, + } as unknown as ContentBlock; + } + // URL images not natively supported by Bedrock — fall back to text placeholder + return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock; } - // URL images not natively supported by Bedrock — fall back to text placeholder - return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock; + // Audio — Bedrock doesn't support native audio input; use transcript fallback + if (part.type === 'audio') { + if (part.source.transcript) { + return { text: `[Voice message]: ${part.source.transcript}` } as ContentBlock; + } + return { text: '[Audio message received but no transcript available]' } as ContentBlock; + } + return { text: JSON.stringify(part) } as ContentBlock; }); return { role, content: blocks }; diff --git a/src/models/capabilities.test.ts b/src/models/capabilities.test.ts new file mode 100644 index 0000000..824ba43 --- /dev/null +++ b/src/models/capabilities.test.ts @@ -0,0 +1,60 @@ +import { describe, it, expect } from 'vitest'; +import { supportsAudioInput } from './capabilities.js'; + +describe('supportsAudioInput', () => { + describe('audio-capable providers with modern models', () => { + it('returns true for gemini with a modern model', () => { + expect(supportsAudioInput('gemini', 'gemini-1.5-pro')).toBe(true); + }); + + it('returns true for openai with a modern model', () => { + expect(supportsAudioInput('openai', 'gpt-4o')).toBe(true); + }); + + it('returns true for github with a modern model', () => { + 
expect(supportsAudioInput('github', 'gpt-4o')).toBe(true); + }); + }); + + describe('non-audio providers return false', () => { + const nonAudioProviders = [ + 'anthropic', + 'bedrock', + 'ollama', + 'llamacpp', + 'openrouter', + 'zhipuai', + 'xai', + ] as const; + + for (const provider of nonAudioProviders) { + it(`returns false for ${provider}`, () => { + expect(supportsAudioInput(provider, 'some-model')).toBe(false); + }); + } + }); + + describe('model-specific exclusions', () => { + const excludedModels = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo']; + + for (const model of excludedModels) { + it(`returns false for openai/${model} despite provider being capable`, () => { + expect(supportsAudioInput('openai', model)).toBe(false); + }); + + it(`returns false for github/${model} despite provider being capable`, () => { + expect(supportsAudioInput('github', model)).toBe(false); + }); + } + }); + + describe('unknown provider', () => { + it('returns false for a completely unknown provider', () => { + expect(supportsAudioInput('unknown-provider', 'some-model')).toBe(false); + }); + + it('returns false for an empty string provider', () => { + expect(supportsAudioInput('', 'some-model')).toBe(false); + }); + }); +}); diff --git a/src/models/capabilities.ts b/src/models/capabilities.ts new file mode 100644 index 0000000..70dc91f --- /dev/null +++ b/src/models/capabilities.ts @@ -0,0 +1,48 @@ +/** + * Model capability detection for native audio input support. + * + * Models that support native audio will receive raw audio data directly. + * Models that don't will receive a Whisper transcript as text instead. + */ + +export type ModelProvider = 'anthropic' | 'openai' | 'gemini' | 'bedrock' | 'github' | 'ollama' | 'llamacpp' | 'openrouter' | 'zhipuai' | 'xai'; + +/** Providers that support native audio input in their API. 
*/ +const AUDIO_CAPABLE_PROVIDERS = new Set([ + 'gemini', + 'openai', + 'github', // GitHub Models uses OpenAI-compatible API +]); + +/** + * Models known NOT to support audio despite their provider supporting it. + * For example, older OpenAI models or specialized models. + */ +const AUDIO_INCAPABLE_MODELS = new Set([ + // Older OpenAI models that predate audio input support + 'gpt-3.5-turbo', + 'gpt-4', + 'gpt-4-turbo', +]); + +/** + * Check whether a provider+model combination supports native audio input. + * + * Returns true if the model can receive raw audio data directly via its API, + * false if audio must be transcribed to text before sending. + */ +export function supportsAudioInput(provider: string, model: string, override?: boolean): boolean { + if (override !== undefined) return override; + + // Provider must be in the capable set + if (!AUDIO_CAPABLE_PROVIDERS.has(provider)) { + return false; + } + + // Check model-specific exclusions + if (AUDIO_INCAPABLE_MODELS.has(model)) { + return false; + } + + return true; +} diff --git a/src/models/gemini.ts b/src/models/gemini.ts index bc7b63e..e055a14 100644 --- a/src/models/gemini.ts +++ b/src/models/gemini.ts @@ -188,6 +188,15 @@ function convertMessages(messages: Message[]): Content[] { // so we pass as a text description. In production, you'd want to fetch + base64 encode. return { text: `[Image: ${part.source.url ?? 
'unavailable'}]` }; } + // Audio part — Gemini supports native audio via inlineData (same format as images) + if (part.type === 'audio') { + return { + inlineData: { + mimeType: part.source.media_type, + data: part.source.data, + }, + }; + } return { text: JSON.stringify(part) }; }); diff --git a/src/models/github.ts b/src/models/github.ts index 38ffeab..65d3d38 100644 --- a/src/models/github.ts +++ b/src/models/github.ts @@ -36,6 +36,23 @@ function toOpenAIContent(content: string | MessageContentPart[]): string | OpenA : part.source.url!; return { type: 'image_url', image_url: { url } }; } + if (part.type === 'audio') { + // GitHub Models uses OpenAI-compatible API — native audio via input_audio + const formatMap: Record = { + 'audio/wav': 'wav', + 'audio/mpeg': 'mp3', + 'audio/mp3': 'mp3', + 'audio/ogg': 'ogg', + 'audio/webm': 'webm', + 'audio/mp4': 'mp4', + 'audio/x-m4a': 'mp4', + }; + const format = formatMap[part.source.media_type] ?? 'wav'; + return { + type: 'input_audio', + input_audio: { data: part.source.data, format }, + } as unknown as OpenAI.ChatCompletionContentPart; + } // Fallback — shouldn't happen return { type: 'text', text: JSON.stringify(part) }; }); diff --git a/src/models/index.ts b/src/models/index.ts index 3dfac1a..72574a0 100644 --- a/src/models/index.ts +++ b/src/models/index.ts @@ -8,17 +8,23 @@ export { LlamaCppClient, type LlamaCppClientConfig } from './local/index.js'; export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js'; export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js'; export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js'; +export { supportsAudioInput } from './capabilities.js'; export { isSupportedImage, + isSupportedAudio, attachmentToImageSource, + attachmentToAudioSource, buildUserMessage, getMessageText, hasImages, + hasAudio, + stripAudioParts, } from './media.js'; export type { Message, MessageContentPart, ImageSource, + AudioSource, 
ChatRequest, ChatResponse, ChatStreamEvent, diff --git a/src/models/media.test.ts b/src/models/media.test.ts index 5dcaff6..95e251b 100644 --- a/src/models/media.test.ts +++ b/src/models/media.test.ts @@ -6,11 +6,14 @@ import { isSupportedImage, isSupportedAudio, attachmentToImageSource, + attachmentToAudioSource, buildUserMessage, getMessageText, getMessageTextWithTools, normalizeMessagesForLocal, hasImages, + hasAudio, + stripAudioParts, transcribeAudio, buildUserMessageWithAudio, type AudioTranscriptionConfig, @@ -820,3 +823,212 @@ describe('normalizeMessagesForLocal', () => { ]); }); }); + +// --------------------------------------------------------------------------- +// 12. attachmentToAudioSource +// --------------------------------------------------------------------------- + +describe('attachmentToAudioSource', () => { + // Positive: supported audio type with data returns AudioSource. + it('returns AudioSource for supported audio type with data', () => { + const result = attachmentToAudioSource(oggAudioAttachment); + + expect(result).toEqual({ + media_type: 'audio/ogg', + data: 'AAAAAAAAAAAAAAAAAAAA', + }); + }); + + // Negative: unsupported MIME type returns null. + it('returns null for unsupported mime type', () => { + const result = attachmentToAudioSource(pdfAttachment); + + expect(result).toBeNull(); + }); + + // Negative: supported audio type but no data returns null. + it('returns null when no data present', () => { + const noDataAudio = makeAttachment({ + mimeType: 'audio/ogg', + filename: 'voice.ogg', + }); + + const result = attachmentToAudioSource(noDataAudio); + + expect(result).toBeNull(); + }); + + // Negative: image attachment returns null. + it('returns null for image attachment', () => { + const result = attachmentToAudioSource(jpegBase64Attachment); + + expect(result).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// 13. 
hasAudio +// --------------------------------------------------------------------------- + +describe('hasAudio', () => { + // Negative: string content never has audio. + it('returns false for string content messages', () => { + const msg: Message = { role: 'user', content: 'no audio here' }; + + expect(hasAudio(msg)).toBe(false); + }); + + // Negative: multimodal messages with only text parts have no audio. + it('returns false for multimodal messages with only text parts', () => { + const msg: Message = { + role: 'user', + content: [{ type: 'text', text: 'just text' }], + }; + + expect(hasAudio(msg)).toBe(false); + }); + + // Negative: multimodal messages with only image parts have no audio. + it('returns false for multimodal messages with only image parts', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } }, + ], + }; + + expect(hasAudio(msg)).toBe(false); + }); + + // Positive: multimodal messages with audio parts are detected. + it('returns true for multimodal messages with audio parts', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } }, + ], + }; + + expect(hasAudio(msg)).toBe(true); + }); + + // Positive: multimodal messages with mixed image + audio parts are detected. + it('returns true for multimodal messages with mixed image+audio parts', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'img' } }, + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } }, + ], + }; + + expect(hasAudio(msg)).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// 14. stripAudioParts +// --------------------------------------------------------------------------- + +describe('stripAudioParts', () => { + // String content passes through unchanged. 
+ it('returns unchanged message for string content', () => { + const msg: Message = { role: 'user', content: 'plain text' }; + + const result = stripAudioParts(msg); + + expect(result).toEqual({ role: 'user', content: 'plain text' }); + }); + + // Audio part with transcript is replaced with transcript text. + it('replaces audio part with transcript text when transcript is present', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'text', text: 'Check this out' }, + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hello world' } }, + ], + }; + + const result = stripAudioParts(msg); + + expect(result.role).toBe('user'); + expect(Array.isArray(result.content)).toBe(true); + const parts = result.content as Array<{ type: string; text?: string }>; + expect(parts).toHaveLength(2); + expect(parts[0]).toEqual({ type: 'text', text: 'Check this out' }); + expect(parts[1]).toEqual({ type: 'text', text: '[Voice message]: Hello world' }); + }); + + // Audio part without transcript is replaced with placeholder. + it('replaces audio part with placeholder when no transcript', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'text', text: 'Listen' }, + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } }, + ], + }; + + const result = stripAudioParts(msg); + + expect(Array.isArray(result.content)).toBe(true); + const parts = result.content as Array<{ type: string; text?: string }>; + expect(parts).toHaveLength(2); + expect(parts[0]).toEqual({ type: 'text', text: 'Listen' }); + expect(parts[1]).toEqual({ type: 'text', text: '[Audio message received but no transcript available]' }); + }); + + // Non-audio parts (text + image) are kept unchanged. 
+ it('keeps non-audio parts unchanged', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'text', text: 'caption' }, + { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } }, + ], + }; + + const result = stripAudioParts(msg); + + expect(result.content).toEqual([ + { type: 'text', text: 'caption' }, + { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } }, + ]); + }); + + // Simplifies to string content when only one text part remains after stripping. + it('simplifies to string content when only one text part remains after stripping', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hi there' } }, + ], + }; + + const result = stripAudioParts(msg); + + expect(result).toEqual({ role: 'user', content: '[Voice message]: Hi there' }); + }); + + // Handles message with multiple audio parts. + it('handles message with multiple audio parts', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'First message' } }, + { type: 'text', text: 'in between' }, + { type: 'audio', source: { media_type: 'audio/mpeg', data: 'BBBB' } }, + ], + }; + + const result = stripAudioParts(msg); + + expect(Array.isArray(result.content)).toBe(true); + const parts = result.content as Array<{ type: string; text?: string }>; + expect(parts).toHaveLength(3); + expect(parts[0]).toEqual({ type: 'text', text: '[Voice message]: First message' }); + expect(parts[1]).toEqual({ type: 'text', text: 'in between' }); + expect(parts[2]).toEqual({ type: 'text', text: '[Audio message received but no transcript available]' }); + }); +}); diff --git a/src/models/media.ts b/src/models/media.ts index 888d079..64a97b9 100644 --- a/src/models/media.ts +++ b/src/models/media.ts @@ -3,7 +3,7 @@ */ import type { Attachment } from '../channels/types.js'; -import 
/**
 * Turn a channel Attachment into a model AudioSource.
 *
 * Returns null when the attachment is not a supported audio type or when it
 * carries no inline (base64) data — only inline data can be forwarded.
 */
export function attachmentToAudioSource(attachment: Attachment): AudioSource | null {
  if (!isSupportedAudio(attachment)) {
    return null;
  }

  const { data, mimeType } = attachment;
  return data ? { media_type: mimeType, data } : null;
}
/**
 * Assemble the user Message for a piece of text plus optional attachments.
 *
 * Each attachment is tried as an image first and as audio second; attachments
 * that convert to neither are ignored. With no usable media the result is a
 * plain string-content message (backward compatible); otherwise the content is
 * a list of parts with the text (when non-empty) ahead of the media parts.
 */
export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
  const media: MessageContentPart[] = [];

  for (const attachment of attachments ?? []) {
    const image = attachmentToImageSource(attachment);
    if (image) {
      media.push({ type: 'image', source: image });
    } else {
      const audio = attachmentToAudioSource(attachment);
      if (audio) {
        media.push({ type: 'audio', source: audio });
      }
    }
  }

  // Nothing convertible: keep the simple string form.
  if (media.length === 0) {
    return { role: 'user', content: text };
  }

  // Text leads, media follows in attachment order.
  const leading: MessageContentPart[] = text ? [{ type: 'text', text }] : [];
  return { role: 'user', content: [...leading, ...media] };
}
/**
 * Report whether a message carries at least one audio content part.
 * String-content messages never do.
 */
export function hasAudio(message: Message): boolean {
  const { content } = message;
  return typeof content !== 'string' && content.some((part) => part.type === 'audio');
}

/**
 * Replace every audio part of a message with a text part.
 *
 * An audio part with a transcript becomes `[Voice message]: <transcript>`; one
 * without becomes a fixed placeholder. Other parts are passed through as-is.
 * String-content messages are returned unchanged. If the rewritten content is
 * exactly one text part, the content is collapsed to that plain string.
 * Intended for providers without native audio input.
 */
export function stripAudioParts(message: Message): Message {
  const { content } = message;
  if (typeof content === 'string') {
    return message;
  }

  const rewritten = content.map((part): MessageContentPart => {
    if (part.type !== 'audio') {
      return part;
    }
    const { transcript } = part.source;
    return {
      type: 'text',
      text: transcript
        ? `[Voice message]: ${transcript}`
        : '[Audio message received but no transcript available]',
    };
  });

  // A lone text part is equivalent to plain string content.
  const [only] = rewritten;
  const collapsed = rewritten.length === 1 && only.type === 'text' ? only.text : rewritten;

  return { ...message, content: collapsed };
}
import type { Tool, ToolResult } from '../types.js';

/**
 * Arguments accepted by the `audio.transcribe` tool.
 * Exactly one of `data` (base64 audio) or `url` must be non-empty.
 */
interface AudioTranscribeArgs {
  data?: string;
  url?: string;
  mime_type?: string;
  language?: string;
  prompt?: string;
}

/** Supported audio MIME types mapped to the extension used for the upload filename. */
const MIME_TO_EXTENSION: Readonly<Record<string, string>> = {
  'audio/ogg': 'ogg',
  'audio/mpeg': 'mp3',
  'audio/mp3': 'mp3',
  'audio/wav': 'wav',
  'audio/webm': 'webm',
  'audio/mp4': 'm4a',
  'audio/x-m4a': 'm4a',
};

const SUPPORTED_MIME_TYPES = new Set(Object.keys(MIME_TO_EXTENSION));

/** Well-known transcription endpoints; only the Ollama entry changes the request shape below. */
const PROVIDER_ENDPOINTS: Record<string, string> = {
  openai: 'https://api.openai.com/v1/audio/transcriptions',
  groq: 'https://api.groq.com/openai/v1/audio/transcriptions',
  ollama: 'http://localhost:11434/api/generate',
  llamacpp: 'http://localhost:8080/v1/audio/transcriptions',
};

/**
 * Check the tool arguments: exactly one of `data`/`url` (empty strings count as
 * absent), `mime_type` required with `data`, and any `mime_type` must be supported.
 */
function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
  const hasData = args.data !== undefined && args.data !== '';
  const hasUrl = args.url !== undefined && args.url !== '';

  if (!hasData && !hasUrl) {
    return { valid: false, error: 'Either data or url must be provided' };
  }

  if (hasData && hasUrl) {
    return { valid: false, error: 'Only one of data or url can be provided' };
  }

  if (hasData && !args.mime_type) {
    return { valid: false, error: 'mime_type is required when using data' };
  }

  if (args.mime_type && !SUPPORTED_MIME_TYPES.has(args.mime_type)) {
    return { valid: false, error: `Unsupported MIME type: ${args.mime_type}. Supported: ${Array.from(SUPPORTED_MIME_TYPES).join(', ')}` };
  }

  return { valid: true };
}

interface AudioTranscriptionConfig {
  endpoint?: string;
  apiKey?: string;
  model?: string;
}

/** Audio payload ready to be attached to the multipart upload. */
interface AudioUpload {
  blob: Blob;
  filename: string;
}

/** Build the failed ToolResult shape used by every error path. */
function failure(error: string | undefined): ToolResult {
  return { success: false, output: '', error };
}

/** Decode base64 tool input into an upload whose bytes are exactly the decoded audio. */
function decodeInlineAudio(data: string, mimeType: string): AudioUpload {
  // Copy into a fresh Uint8Array. A small Buffer is a view into Node's shared
  // allocation pool, so handing `Buffer#buffer` to Blob would upload the whole
  // pool (wrong bytes, and unrelated memory) instead of just the audio.
  const bytes = new Uint8Array(Buffer.from(data, 'base64'));
  if (bytes.byteLength === 0) {
    // Buffer.from never throws on malformed base64; it just yields nothing.
    throw new Error('data is not valid base64-encoded audio');
  }

  const ext = MIME_TO_EXTENSION[mimeType] ?? 'bin';
  return { blob: new Blob([bytes], { type: mimeType }), filename: `audio.${ext}` };
}

/** Extract a file extension from the URL path only (ignores host, query and fragment). */
function extensionFromUrl(url: string): string | undefined {
  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    return undefined;
  }
  return /\.([A-Za-z0-9]{1,8})$/.exec(pathname)?.[1].toLowerCase();
}

/** Download audio from a URL and label it with the declared or server-reported MIME type. */
async function downloadAudio(url: string, declaredMimeType?: string): Promise<AudioUpload> {
  // NOTE(review): `url` is tool input and is fetched without any host allow-list
  // (SSRF surface) — confirm the tool policy layer restricts who can call this.
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to download audio from ${url}: ${response.status} ${response.statusText}`);
  }
  const bytes = await response.arrayBuffer();

  const headerType = response.headers.get('content-type')?.split(';')[0].trim().toLowerCase();
  const mimeType = declaredMimeType ?? (headerType || 'audio/wav');
  const ext = extensionFromUrl(url) ?? MIME_TO_EXTENSION[mimeType] ?? 'bin';

  return { blob: new Blob([bytes], { type: mimeType }), filename: `audio.${ext}` };
}

/**
 * Create the `audio.transcribe` tool.
 *
 * The tool validates its arguments, obtains the audio (inline base64 or by
 * downloading `url`) and posts it as multipart form data to the configured
 * Whisper-compatible endpoint. When the endpoint equals the built-in Ollama
 * URL a JSON request is sent instead. `execute` never rejects: every failure
 * is reported as `{ success: false, output: '', error }`.
 *
 * @param audioConfig Endpoint, optional bearer API key and model (default `whisper-1`).
 * @returns The tool definition.
 */
export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig): Tool {
  return {
    name: 'audio.transcribe',
    description: 'Transcribe an audio file to text using a Whisper-compatible API. Accepts base64-encoded audio data or a URL to download the audio file from.',
    inputSchema: {
      type: 'object',
      properties: {
        data: {
          type: 'string',
          description: 'Base64-encoded audio data (alternative to url)',
        },
        url: {
          type: 'string',
          description: 'URL to download the audio file from (alternative to data)',
        },
        mime_type: {
          type: 'string',
          description: 'MIME type of audio file (required when using data, e.g., audio/ogg, audio/mpeg, audio/wav)',
        },
        language: {
          type: 'string',
          description: 'Language code (e.g., en, es, fr) - optional',
        },
        prompt: {
          type: 'string',
          description: 'Optional text to guide transcription (OpenAI/Groq/custom only)',
        },
      },
    },

    execute: async (rawArgs: unknown): Promise<ToolResult> => {
      // Tolerate null/undefined input so validation reports it instead of throwing.
      const args = (rawArgs ?? {}) as AudioTranscribeArgs;

      const validation = validateInput(args);
      if (!validation.valid) {
        return failure(validation.error);
      }

      const endpoint = audioConfig?.endpoint;
      if (!endpoint) {
        return failure('Audio transcription endpoint not configured. Set audio.endpoint in config.yaml');
      }

      try {
        // Validation guarantees exactly one non-empty source, and mime_type with data.
        const upload = args.data
          ? decodeInlineAudio(args.data, args.mime_type as string)
          : await downloadAudio(args.url as string, args.mime_type);

        const model = audioConfig?.model ?? 'whisper-1';

        if (endpoint === PROVIDER_ENDPOINTS.ollama) {
          const ollamaResponse = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model,
              // `||` rather than `??`: an empty `data` string counts as absent.
              audio: args.data || args.url,
              stream: false,
            }),
          });

          if (!ollamaResponse.ok) {
            throw new Error(`Ollama transcription failed: ${ollamaResponse.status}`);
          }

          const ollamaJson = await ollamaResponse.json() as { response?: string };
          return {
            success: true,
            output: ollamaJson.response ?? 'No response from Ollama',
          };
        }

        const formData = new FormData();
        formData.append('file', upload.blob, upload.filename);
        formData.append('model', model);
        if (args.language) {
          formData.append('language', args.language);
        }
        if (args.prompt) {
          formData.append('prompt', args.prompt);
        }

        // No explicit Content-Type: fetch derives the multipart boundary from the FormData body.
        const fetchOptions: RequestInit = { method: 'POST', body: formData };
        if (audioConfig?.apiKey) {
          fetchOptions.headers = { Authorization: `Bearer ${audioConfig.apiKey}` };
        }

        const response = await fetch(endpoint, fetchOptions);
        if (!response.ok) {
          const errorText = await response.text();
          throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
        }

        const json = await response.json() as { text?: unknown };
        if (typeof json.text !== 'string') {
          throw new Error('Transcription response did not contain a "text" field');
        }
        return { success: true, output: json.text };
      } catch (error) {
        return failure(error instanceof Error ? error.message : 'Unknown error occurred');
      }
    },
  };
}