Merge branch 'feature/native-audio-support'
This commit is contained in:
+52
-1
@@ -1089,7 +1089,7 @@
|
|||||||
},
|
},
|
||||||
|
|
||||||
"overall_progress": {
|
"overall_progress": {
|
||||||
"total_test_count": 1331,
|
"total_test_count": 1369,
|
||||||
"all_tests_passing": true,
|
"all_tests_passing": true,
|
||||||
"p0_completion": "3/3 (100%)",
|
"p0_completion": "3/3 (100%)",
|
||||||
"p1_completion": "4/4 (100%)",
|
"p1_completion": "4/4 (100%)",
|
||||||
@@ -1107,6 +1107,7 @@
|
|||||||
"feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)",
|
"feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)",
|
||||||
"operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next",
|
"operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next",
|
||||||
"gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram",
|
"gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram",
|
||||||
|
"native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback",
|
||||||
"next_up": "End-to-end test that Flynn follows through on tool calls via GitHub Copilot fallback. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
|
"next_up": "End-to-end test that Flynn follows through on tool calls via GitHub Copilot fallback. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
|
||||||
},
|
},
|
||||||
"soul_md_and_cron_create": {
|
"soul_md_and_cron_create": {
|
||||||
@@ -1137,6 +1138,56 @@
|
|||||||
"src/backends/native/agent.test.ts"
|
"src/backends/native/agent.test.ts"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
"native-audio-support": {
|
||||||
|
"status": "completed",
|
||||||
|
"date": "2026-02-11",
|
||||||
|
"summary": "Native audio input support — voice messages passed directly to audio-capable models (Gemini, OpenAI, GitHub) instead of always transcribing via Whisper. Smart routing decides per-model whether to pass raw audio or transcribe first.",
|
||||||
|
"phases": {
|
||||||
|
"audio_transcribe_tool": {
|
||||||
|
"status": "completed",
|
||||||
|
"description": "audio.transcribe tool with Whisper-compatible API support",
|
||||||
|
"files_created": [
|
||||||
|
"src/tools/builtin/audio-transcribe.ts"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"type_system_and_clients": {
|
||||||
|
"status": "completed",
|
||||||
|
"description": "AudioSource type, audio content part handling in all model clients (Gemini inlineData, OpenAI input_audio, GitHub input_audio = native; Anthropic, Bedrock = text fallback)",
|
||||||
|
"files_modified": [
|
||||||
|
"src/models/types.ts",
|
||||||
|
"src/models/gemini.ts",
|
||||||
|
"src/models/openai.ts",
|
||||||
|
"src/models/github.ts",
|
||||||
|
"src/models/anthropic.ts",
|
||||||
|
"src/models/bedrock.ts",
|
||||||
|
"src/models/media.ts"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"capabilities_and_routing": {
|
||||||
|
"status": "completed",
|
||||||
|
"description": "supportsAudioInput() capability check, smart routing in daemon that transcribes for non-audio models and passes raw audio for capable ones, supports_audio config override",
|
||||||
|
"files_created": [
|
||||||
|
"src/models/capabilities.ts",
|
||||||
|
"src/models/capabilities.test.ts"
|
||||||
|
],
|
||||||
|
"files_modified": [
|
||||||
|
"src/daemon/routing.ts",
|
||||||
|
"src/config/schema.ts"
|
||||||
|
],
|
||||||
|
"test_status": "18/18 passing"
|
||||||
|
},
|
||||||
|
"tests_and_token_estimation": {
|
||||||
|
"status": "completed",
|
||||||
|
"description": "Audio tests for media helpers, audio token estimation (base64→bytes→duration→tokens at 32 tokens/sec), supports_audio config override wiring",
|
||||||
|
"files_modified": [
|
||||||
|
"src/models/media.test.ts",
|
||||||
|
"src/context/tokens.ts",
|
||||||
|
"src/context/tokens.test.ts"
|
||||||
|
],
|
||||||
|
"test_status": "20/20 tokens tests, 87/87 media tests"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"stopreason-normalization": {
|
"stopreason-normalization": {
|
||||||
"date": "2026-02-11",
|
"date": "2026-02-11",
|
||||||
"summary": "Normalize OpenAI/GitHub finish_reason to Flynn stopReason conventions. OpenAI 'stop' → 'end_turn', 'length' → 'max_tokens', 'tool_calls' with tools → 'tool_use', 'tool_calls' without tools → 'end_turn'. Fixes premature agent loop exit when falling back to GitHub Copilot (Anthropic API quota exhausted). Agent loop now accepts both 'tool_use' and 'tool_calls' as belt-and-suspenders.",
|
"summary": "Normalize OpenAI/GitHub finish_reason to Flynn stopReason conventions. OpenAI 'stop' → 'end_turn', 'length' → 'max_tokens', 'tool_calls' with tools → 'tool_use', 'tool_calls' without tools → 'end_turn'. Fixes premature agent loop exit when falling back to GitHub Copilot (Anthropic API quota exhausted). Agent loop now accepts both 'tool_use' and 'tool_calls' as belt-and-suspenders.",
|
||||||
|
|||||||
+10
-3
@@ -52,6 +52,7 @@ const modelConfigBaseSchema = z.object({
|
|||||||
for: z.array(z.string()).optional(),
|
for: z.array(z.string()).optional(),
|
||||||
num_gpu: z.number().optional(),
|
num_gpu: z.number().optional(),
|
||||||
context_window: z.number().optional(),
|
context_window: z.number().optional(),
|
||||||
|
supports_audio: z.boolean().optional(),
|
||||||
});
|
});
|
||||||
|
|
||||||
const modelConfigSchema = modelConfigBaseSchema.extend({
|
const modelConfigSchema = modelConfigBaseSchema.extend({
|
||||||
@@ -314,10 +315,16 @@ const webSearchSchema = z.object({
|
|||||||
max_results: z.number().min(1).max(20).default(5),
|
max_results: z.number().min(1).max(20).default(5),
|
||||||
}).default({});
|
}).default({});
|
||||||
|
|
||||||
|
const audioProviderSchema = z.object({
|
||||||
|
type: z.enum(['openai', 'groq', 'ollama', 'llamacpp', 'custom']),
|
||||||
|
endpoint: z.string().optional(),
|
||||||
|
api_key: z.string().optional(),
|
||||||
|
model: z.string().optional(),
|
||||||
|
});
|
||||||
|
|
||||||
const audioSchema = z.object({
|
const audioSchema = z.object({
|
||||||
transcription_endpoint: z.string().optional(),
|
enabled: z.boolean().default(false),
|
||||||
transcription_api_key: z.string().optional(),
|
provider: audioProviderSchema.optional(),
|
||||||
transcription_model: z.string().default('whisper-1'),
|
|
||||||
}).default({});
|
}).default({});
|
||||||
|
|
||||||
// ── Tool policy schemas ──────────────────────────────────────────────
|
// ── Tool policy schemas ──────────────────────────────────────────────
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import { describe, it, expect } from 'vitest';
|
import { describe, it, expect } from 'vitest';
|
||||||
import { estimateTokens, estimateMessageTokens, getContextWindow, shouldCompact, CONTEXT_WINDOWS } from './tokens.js';
|
import { estimateTokens, estimateAudioTokens, estimateMessageTokens, getContextWindow, shouldCompact, CONTEXT_WINDOWS } from './tokens.js';
|
||||||
|
|
||||||
describe('estimateTokens', () => {
|
describe('estimateTokens', () => {
|
||||||
it('returns 0 for empty string', () => {
|
it('returns 0 for empty string', () => {
|
||||||
@@ -20,6 +20,33 @@ describe('estimateTokens', () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe('estimateAudioTokens', () => {
  /** Build an OGG audio source whose base64 payload is `base64Length` characters long. */
  const oggSource = (base64Length: number) => ({
    media_type: 'audio/ogg',
    data: 'A'.repeat(base64Length),
  });

  it('returns positive number for valid audio data', () => {
    // 10000 base64 chars → 7500 bytes → 3.75 s → ceil(3.75 * 32) = 120 tokens
    const estimate = estimateAudioTokens(oggSource(10000));

    expect(estimate).toBeGreaterThan(0);
    expect(estimate).toBe(120);
  });

  it('returns at least 1 for very short audio', () => {
    // A single base64 character is a tiny fraction of a second, but the
    // estimate is clamped to a minimum of 1 for non-empty data.
    expect(estimateAudioTokens(oggSource(1))).toBe(1);
  });

  it('returns 0 for empty audio data', () => {
    expect(estimateAudioTokens(oggSource(0))).toBe(0);
  });

  it('longer audio data produces more tokens', () => {
    const shortEstimate = estimateAudioTokens(oggSource(1000));
    const longEstimate = estimateAudioTokens(oggSource(100000));

    expect(longEstimate).toBeGreaterThan(shortEstimate);
  });
});
|
||||||
|
|
||||||
describe('estimateMessageTokens', () => {
|
describe('estimateMessageTokens', () => {
|
||||||
it('returns 0 for empty array', () => {
|
it('returns 0 for empty array', () => {
|
||||||
expect(estimateMessageTokens([])).toBe(0);
|
expect(estimateMessageTokens([])).toBe(0);
|
||||||
@@ -38,6 +65,23 @@ describe('estimateMessageTokens', () => {
|
|||||||
];
|
];
|
||||||
expect(estimateMessageTokens(messages)).toBe(10);
|
expect(estimateMessageTokens(messages)).toBe(10);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('includes audio token estimate for multimodal messages', () => {
  // Expected total is the sum of three contributions:
  //   text     'hello' = 5 chars → ceil(5 / 4)  =   2
  //   audio    10000 base64 chars               = 120
  //   overhead per message                      =   4
  //                                       total = 126
  const textPart = { type: 'text' as const, text: 'hello' };
  const audioPart = {
    type: 'audio' as const,
    source: { media_type: 'audio/ogg', data: 'A'.repeat(10000) },
  };
  const messages = [{ role: 'user' as const, content: [textPart, audioPart] }];

  expect(estimateMessageTokens(messages)).toBe(126);
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('getContextWindow', () => {
|
describe('getContextWindow', () => {
|
||||||
|
|||||||
+34
-5
@@ -1,4 +1,4 @@
|
|||||||
import type { Message } from '../models/types.js';
|
import type { Message, AudioSource } from '../models/types.js';
|
||||||
import { getMessageText } from '../models/media.js';
|
import { getMessageText } from '../models/media.js';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -36,6 +36,25 @@ export function estimateTokens(text: string): number {
|
|||||||
return Math.ceil(text.length / 4);
|
return Math.ceil(text.length / 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Assumed voice-note bitrate: ~16 kbps, i.e. 2000 bytes per second (typical voice OGG/Opus). */
const DEFAULT_AUDIO_BYTES_PER_SECOND = 2000;

/** Assumed token cost of audio input: ~32 tokens per second (Gemini-style rate). */
const DEFAULT_AUDIO_TOKENS_PER_SECOND = 32;

/**
 * Estimate token count for an audio content part.
 *
 * Heuristic:
 * 1. Convert the base64 length to bytes: `base64Length * 0.75`
 * 2. Convert bytes to seconds using the assumed bitrate: `bytes / bytesPerSecond`
 * 3. Convert seconds to tokens: `seconds * tokensPerSecond`
 *
 * The defaults describe a low-bitrate voice note sent to a Gemini-style model.
 * Callers that know the real encoding or the target model's rate can pass
 * `options` to override either constant; omitting it keeps the defaults.
 *
 * @param audioSource - Audio source; only the length of its base64 `data` is read.
 * @param options - Optional overrides for the heuristic's constants. Values that
 *   are not finite positive numbers are ignored in favour of the defaults.
 * @returns 0 when `data` is missing or empty, otherwise a whole number of
 *   tokens that is at least 1.
 */
export function estimateAudioTokens(
  audioSource: AudioSource,
  options: { bytesPerSecond?: number; tokensPerSecond?: number } = {},
): number {
  // Treat a missing payload like an empty one instead of throwing on `.length`.
  const base64Length = audioSource.data?.length ?? 0;
  if (base64Length === 0) {
    return 0;
  }

  // Reject unusable overrides (undefined, NaN, zero, negative) so the estimate
  // can never become NaN or Infinity and corrupt a running message total.
  const usable = (value: number | undefined, fallback: number): number =>
    typeof value === 'number' && Number.isFinite(value) && value > 0 ? value : fallback;
  const bytesPerSecond = usable(options.bytesPerSecond, DEFAULT_AUDIO_BYTES_PER_SECOND);
  const tokensPerSecond = usable(options.tokensPerSecond, DEFAULT_AUDIO_TOKENS_PER_SECOND);

  // Every 4 base64 characters encode 3 bytes, hence the 0.75 factor.
  const durationSeconds = (base64Length * 0.75) / bytesPerSecond;
  return Math.max(1, Math.ceil(durationSeconds * tokensPerSecond));
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Estimate the total token count for an array of messages.
|
* Estimate the total token count for an array of messages.
|
||||||
*
|
*
|
||||||
@@ -43,10 +62,20 @@ export function estimateTokens(text: string): number {
|
|||||||
* overhead of ~4 tokens to account for the role marker and separators.
|
* overhead of ~4 tokens to account for the role marker and separators.
|
||||||
*/
|
*/
|
||||||
export function estimateMessageTokens(messages: Message[]): number {
|
export function estimateMessageTokens(messages: Message[]): number {
|
||||||
return messages.reduce(
|
return messages.reduce((sum, msg) => {
|
||||||
(sum, msg) => sum + estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS,
|
let tokens = estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS;
|
||||||
0,
|
|
||||||
);
|
// Add audio token estimates for multimodal messages
|
||||||
|
if (Array.isArray(msg.content)) {
|
||||||
|
for (const part of msg.content) {
|
||||||
|
if (part.type === 'audio') {
|
||||||
|
tokens += estimateAudioTokens(part.source);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sum + tokens;
|
||||||
|
}, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
+1
-8
@@ -5,7 +5,6 @@ import { mkdirSync } from 'fs';
|
|||||||
|
|
||||||
// ── Config & Types ──
|
// ── Config & Types ──
|
||||||
import type { Config } from '../config/index.js';
|
import type { Config } from '../config/index.js';
|
||||||
import type { AudioTranscriptionConfig } from '../models/media.js';
|
|
||||||
import type { ToolRegistry, ToolExecutor, BrowserManager } from '../tools/index.js';
|
import type { ToolRegistry, ToolExecutor, BrowserManager } from '../tools/index.js';
|
||||||
import type { AgentConfigRegistry, AgentRouter } from '../agents/index.js';
|
import type { AgentConfigRegistry, AgentRouter } from '../agents/index.js';
|
||||||
import type { SandboxManager } from '../sandbox/index.js';
|
import type { SandboxManager } from '../sandbox/index.js';
|
||||||
@@ -100,12 +99,6 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
|
|||||||
const { skillRegistry, skillInstaller } = initSkills(config);
|
const { skillRegistry, skillInstaller } = initSkills(config);
|
||||||
const { agentConfigRegistry, agentRouter, sandboxManager } = await initAgents({ config, lifecycle });
|
const { agentConfigRegistry, agentRouter, sandboxManager } = await initAgents({ config, lifecycle });
|
||||||
|
|
||||||
// ── Model & Prompt ──
|
|
||||||
const audioConfig: AudioTranscriptionConfig = {
|
|
||||||
endpoint: config.audio.transcription_endpoint,
|
|
||||||
apiKey: config.audio.transcription_api_key,
|
|
||||||
model: config.audio.transcription_model,
|
|
||||||
};
|
|
||||||
const modelRouter = createModelRouter(config);
|
const modelRouter = createModelRouter(config);
|
||||||
|
|
||||||
// Restore persisted model tier
|
// Restore persisted model tier
|
||||||
@@ -133,7 +126,7 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
|
|||||||
|
|
||||||
const messageRouter = createMessageRouter({
|
const messageRouter = createMessageRouter({
|
||||||
sessionManager, modelRouter, systemPrompt, toolRegistry, toolExecutor,
|
sessionManager, modelRouter, systemPrompt, toolRegistry, toolExecutor,
|
||||||
config, memoryStore, agentConfigRegistry, agentRouter, sandboxManager, audioConfig,
|
config, memoryStore, agentConfigRegistry, agentRouter, sandboxManager,
|
||||||
});
|
});
|
||||||
channelRegistry.setMessageHandler(messageRouter.handler);
|
channelRegistry.setMessageHandler(messageRouter.handler);
|
||||||
channelAgents = messageRouter.agents;
|
channelAgents = messageRouter.agents;
|
||||||
|
|||||||
+48
-10
@@ -1,6 +1,7 @@
|
|||||||
import type { AudioTranscriptionConfig } from '../models/media.js';
|
import type { AudioTranscriptionConfig } from '../models/media.js';
|
||||||
import type { Attachment } from '../channels/types.js';
|
import type { Attachment } from '../channels/types.js';
|
||||||
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
|
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
|
||||||
|
import { supportsAudioInput } from '../models/capabilities.js';
|
||||||
import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
|
import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
|
||||||
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
|
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
|
||||||
import type { InboundMessage, OutboundMessage } from '../channels/index.js';
|
import type { InboundMessage, OutboundMessage } from '../channels/index.js';
|
||||||
@@ -32,7 +33,6 @@ export function createMessageRouter(deps: {
|
|||||||
agentConfigRegistry?: AgentConfigRegistry;
|
agentConfigRegistry?: AgentConfigRegistry;
|
||||||
agentRouter?: AgentRouter;
|
agentRouter?: AgentRouter;
|
||||||
sandboxManager?: SandboxManager;
|
sandboxManager?: SandboxManager;
|
||||||
audioConfig?: AudioTranscriptionConfig;
|
|
||||||
}): {
|
}): {
|
||||||
handler: (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>) => Promise<void>;
|
handler: (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>) => Promise<void>;
|
||||||
agents: Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>;
|
agents: Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>;
|
||||||
@@ -213,18 +213,56 @@ export function createMessageRouter(deps: {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Transcribe audio attachments before processing
|
// Determine if the active model supports native audio input
|
||||||
let messageText = msg.text;
|
let effectiveTier: string = deps.config.agents.primary_tier ?? 'default';
|
||||||
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
|
if (msg.metadata?.modelTier) {
|
||||||
|
effectiveTier = msg.metadata.modelTier as string;
|
||||||
if (audioAttachments.length > 0 && deps.audioConfig) {
|
} else if (deps.agentRouter && deps.agentConfigRegistry) {
|
||||||
for (const att of audioAttachments) {
|
const agentName = deps.agentRouter.resolve(msg.channel, msg.senderId);
|
||||||
const transcript = await transcribeAudio(att, deps.audioConfig);
|
if (agentName) {
|
||||||
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
|
const agentCfg = deps.agentConfigRegistry.get(agentName);
|
||||||
|
if (agentCfg?.modelTier) {
|
||||||
|
effectiveTier = agentCfg.modelTier;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const response = await agent.process(messageText, msg.attachments);
|
// Look up provider/model for the effective tier
|
||||||
|
const modelsConfig = deps.config.models as Record<string, { provider?: string; model?: string } | undefined>;
|
||||||
|
const tierConfig = modelsConfig[effectiveTier] ?? deps.config.models.default;
|
||||||
|
const modelProvider = tierConfig?.provider ?? deps.config.models.default.provider;
|
||||||
|
const modelName = tierConfig?.model ?? deps.config.models.default.model;
|
||||||
|
const supportsAudioOverride = (tierConfig as Record<string, unknown> | undefined)?.supports_audio as boolean | undefined;
|
||||||
|
const nativeAudioSupported = supportsAudioInput(modelProvider, modelName, supportsAudioOverride);
|
||||||
|
|
||||||
|
let messageText = msg.text;
|
||||||
|
let attachments = msg.attachments;
|
||||||
|
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
|
||||||
|
|
||||||
|
if (audioAttachments.length > 0 && !nativeAudioSupported) {
|
||||||
|
// Model doesn't support native audio — transcribe via Whisper and strip audio attachments
|
||||||
|
const audioConfig: AudioTranscriptionConfig | undefined = deps.config.audio?.enabled && deps.config.audio.provider
|
||||||
|
? {
|
||||||
|
endpoint: deps.config.audio.provider.endpoint,
|
||||||
|
apiKey: deps.config.audio.provider.api_key,
|
||||||
|
model: deps.config.audio.provider.model,
|
||||||
|
}
|
||||||
|
: undefined;
|
||||||
|
|
||||||
|
if (audioConfig?.endpoint) {
|
||||||
|
for (const att of audioAttachments) {
|
||||||
|
const transcript = await transcribeAudio(att, audioConfig);
|
||||||
|
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Remove audio attachments so buildUserMessage doesn't create audio content parts
|
||||||
|
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
|
||||||
|
if (attachments.length === 0) { attachments = undefined; }
|
||||||
|
}
|
||||||
|
// If native audio IS supported, we pass attachments through unchanged —
|
||||||
|
// buildUserMessage() in the agent will create native audio content parts
|
||||||
|
|
||||||
|
const response = await agent.process(messageText, attachments);
|
||||||
const outboundAttachments = collector.drain();
|
const outboundAttachments = collector.drain();
|
||||||
await reply({
|
await reply({
|
||||||
text: response,
|
text: response,
|
||||||
|
|||||||
+14
-1
@@ -1,7 +1,8 @@
|
|||||||
import type { Config } from '../config/index.js';
|
import type { Config } from '../config/index.js';
|
||||||
import type { Lifecycle } from './lifecycle.js';
|
import type { Lifecycle } from './lifecycle.js';
|
||||||
|
import type { AudioTranscriptionConfig } from '../models/media.js';
|
||||||
import { HookEngine } from '../hooks/index.js';
|
import { HookEngine } from '../hooks/index.js';
|
||||||
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js';
|
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createAudioTranscribeTool } from '../tools/index.js';
|
||||||
|
|
||||||
export interface ToolsDeps {
|
export interface ToolsDeps {
|
||||||
config: Config;
|
config: Config;
|
||||||
@@ -52,6 +53,18 @@ export function initTools(deps: ToolsDeps): ToolsResult {
|
|||||||
console.log('Process manager stopped');
|
console.log('Process manager stopped');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Register audio transcription tool if configured
|
||||||
|
if (config.audio?.enabled && config.audio.provider) {
|
||||||
|
const audioConfig: AudioTranscriptionConfig = {
|
||||||
|
endpoint: config.audio.provider.endpoint,
|
||||||
|
apiKey: config.audio.provider.api_key,
|
||||||
|
model: config.audio.provider.model,
|
||||||
|
};
|
||||||
|
const audioTool = createAudioTranscribeTool(audioConfig);
|
||||||
|
toolRegistry.register(audioTool);
|
||||||
|
console.log(`Audio transcription enabled (type=${config.audio.provider.type}, endpoint=${audioConfig.endpoint})`);
|
||||||
|
}
|
||||||
|
|
||||||
// Initialize browser manager and register browser tools (if enabled)
|
// Initialize browser manager and register browser tools (if enabled)
|
||||||
let browserManager: BrowserManager | undefined;
|
let browserManager: BrowserManager | undefined;
|
||||||
if (config.browser?.enabled) {
|
if (config.browser?.enabled) {
|
||||||
|
|||||||
@@ -41,6 +41,13 @@ function toAnthropicContent(content: string | MessageContentPart[]): string | un
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
// Audio — Anthropic doesn't support native audio input; use transcript fallback
|
||||||
|
if (part.type === 'audio') {
|
||||||
|
if (part.source.transcript) {
|
||||||
|
return { type: 'text', text: `[Voice message]: ${part.source.transcript}` };
|
||||||
|
}
|
||||||
|
return { type: 'text', text: '[Audio message received but no transcript available]' };
|
||||||
|
}
|
||||||
return part;
|
return part;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
+20
-10
@@ -170,17 +170,27 @@ function convertMessages(messages: Message[]): BedrockMessage[] {
|
|||||||
if (part.type === 'text') {
|
if (part.type === 'text') {
|
||||||
return { text: part.text } as ContentBlock;
|
return { text: part.text } as ContentBlock;
|
||||||
}
|
}
|
||||||
// Image part — Bedrock uses { image: { format, source: { bytes } } }
|
if (part.type === 'image') {
|
||||||
if (part.source.type === 'base64' && part.source.data) {
|
// Image part — Bedrock uses { image: { format, source: { bytes } } }
|
||||||
return {
|
if (part.source.type === 'base64' && part.source.data) {
|
||||||
image: {
|
return {
|
||||||
format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp',
|
image: {
|
||||||
source: { bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) },
|
format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp',
|
||||||
},
|
source: { bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) },
|
||||||
} as unknown as ContentBlock;
|
},
|
||||||
|
} as unknown as ContentBlock;
|
||||||
|
}
|
||||||
|
// URL images not natively supported by Bedrock — fall back to text placeholder
|
||||||
|
return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock;
|
||||||
}
|
}
|
||||||
// URL images not natively supported by Bedrock — fall back to text placeholder
|
// Audio — Bedrock doesn't support native audio input; use transcript fallback
|
||||||
return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock;
|
if (part.type === 'audio') {
|
||||||
|
if (part.source.transcript) {
|
||||||
|
return { text: `[Voice message]: ${part.source.transcript}` } as ContentBlock;
|
||||||
|
}
|
||||||
|
return { text: '[Audio message received but no transcript available]' } as ContentBlock;
|
||||||
|
}
|
||||||
|
return { text: JSON.stringify(part) } as ContentBlock;
|
||||||
});
|
});
|
||||||
|
|
||||||
return { role, content: blocks };
|
return { role, content: blocks };
|
||||||
|
|||||||
@@ -0,0 +1,60 @@
|
|||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
import { supportsAudioInput } from './capabilities.js';
|
||||||
|
|
||||||
|
describe('supportsAudioInput', () => {
  describe('audio-capable providers with modern models', () => {
    it('returns true for gemini with a modern model', () => {
      expect(supportsAudioInput('gemini', 'gemini-1.5-pro')).toBe(true);
    });

    it('returns true for openai with a modern model', () => {
      expect(supportsAudioInput('openai', 'gpt-4o')).toBe(true);
    });

    it('returns true for github with a modern model', () => {
      expect(supportsAudioInput('github', 'gpt-4o')).toBe(true);
    });
  });

  describe('non-audio providers return false', () => {
    const nonAudioProviders = [
      'anthropic',
      'bedrock',
      'ollama',
      'llamacpp',
      'openrouter',
      'zhipuai',
      'xai',
    ] as const;

    for (const provider of nonAudioProviders) {
      it(`returns false for ${provider}`, () => {
        expect(supportsAudioInput(provider, 'some-model')).toBe(false);
      });
    }
  });

  describe('model-specific exclusions', () => {
    const excludedModels = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo'];

    for (const model of excludedModels) {
      it(`returns false for openai/${model} despite provider being capable`, () => {
        expect(supportsAudioInput('openai', model)).toBe(false);
      });

      it(`returns false for github/${model} despite provider being capable`, () => {
        expect(supportsAudioInput('github', model)).toBe(false);
      });
    }
  });

  describe('unknown provider', () => {
    it('returns false for a completely unknown provider', () => {
      expect(supportsAudioInput('unknown-provider', 'some-model')).toBe(false);
    });

    it('returns false for an empty string provider', () => {
      expect(supportsAudioInput('', 'some-model')).toBe(false);
    });
  });

  // The third argument is the explicit per-model override; when it is defined
  // it must win over both the provider allow-list and the model exclusions.
  describe('explicit override', () => {
    it('returns true when override is true, even for a non-audio provider', () => {
      expect(supportsAudioInput('anthropic', 'some-model', true)).toBe(true);
    });

    it('returns true when override is true, even for an excluded model', () => {
      expect(supportsAudioInput('openai', 'gpt-4', true)).toBe(true);
    });

    it('returns false when override is false, even for an audio-capable model', () => {
      expect(supportsAudioInput('gemini', 'gemini-1.5-pro', false)).toBe(false);
    });

    it('falls back to detection when override is undefined', () => {
      expect(supportsAudioInput('openai', 'gpt-4o', undefined)).toBe(true);
      expect(supportsAudioInput('anthropic', 'some-model', undefined)).toBe(false);
    });
  });
});
|
||||||
@@ -0,0 +1,48 @@
|
|||||||
|
/**
|
||||||
|
* Model capability detection for native audio input support.
|
||||||
|
*
|
||||||
|
* Models that support native audio will receive raw audio data directly.
|
||||||
|
* Models that don't will receive a Whisper transcript as text instead.
|
||||||
|
*/
|
||||||
|
|
||||||
|
export type ModelProvider = 'anthropic' | 'openai' | 'gemini' | 'bedrock' | 'github' | 'ollama' | 'llamacpp' | 'openrouter' | 'zhipuai' | 'xai';
|
||||||
|
|
||||||
|
/** Providers that support native audio input in their API. */
const AUDIO_CAPABLE_PROVIDERS = new Set<string>([
  'gemini',
  'openai',
  'github', // GitHub Models uses OpenAI-compatible API
]);

/**
 * Model families known NOT to support audio despite their provider supporting it.
 * For example, older OpenAI models or specialized models.
 *
 * Each entry matches the bare family name and any `-suffixed` variant of it
 * (so `gpt-4` also covers `gpt-4-0613`), but not a different family that
 * merely shares leading characters (`gpt-4o`, `gpt-4.1`).
 */
const AUDIO_INCAPABLE_MODELS = new Set<string>([
  // Older OpenAI models that predate audio input support
  'gpt-3.5-turbo',
  'gpt-4',
  'gpt-4-turbo',
]);

/**
 * Report whether a model id belongs to one of the excluded families.
 *
 * The id is lower-cased and trimmed, and anything up to the last `/` is
 * dropped so that publisher-prefixed ids such as `openai/gpt-4` are compared
 * by their bare model name.
 */
function isAudioIncapableModel(model: string): boolean {
  const normalized = model.trim().toLowerCase();
  const bareName = normalized.slice(normalized.lastIndexOf('/') + 1);
  return Array.from(AUDIO_INCAPABLE_MODELS).some(
    (family) => bareName === family || bareName.startsWith(`${family}-`),
  );
}

/**
 * Check whether a provider+model combination supports native audio input.
 *
 * @param provider - Provider identifier; must be in the audio-capable set to qualify.
 * @param model - Model identifier, checked against the excluded model families.
 * @param override - When defined, returned as-is and no detection is performed.
 * @returns true if the model can receive raw audio data directly via its API,
 *   false if audio must be transcribed to text before sending.
 */
export function supportsAudioInput(provider: string, model: string, override?: boolean): boolean {
  if (override !== undefined) {
    return override;
  }

  // Provider must be in the capable set, and the model must not be excluded.
  return AUDIO_CAPABLE_PROVIDERS.has(provider) && !isAudioIncapableModel(model);
}
|
||||||
@@ -188,6 +188,15 @@ function convertMessages(messages: Message[]): Content[] {
|
|||||||
// so we pass as a text description. In production, you'd want to fetch + base64 encode.
|
// so we pass as a text description. In production, you'd want to fetch + base64 encode.
|
||||||
return { text: `[Image: ${part.source.url ?? 'unavailable'}]` };
|
return { text: `[Image: ${part.source.url ?? 'unavailable'}]` };
|
||||||
}
|
}
|
||||||
|
// Audio part — Gemini supports native audio via inlineData (same format as images)
|
||||||
|
if (part.type === 'audio') {
|
||||||
|
return {
|
||||||
|
inlineData: {
|
||||||
|
mimeType: part.source.media_type,
|
||||||
|
data: part.source.data,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
return { text: JSON.stringify(part) };
|
return { text: JSON.stringify(part) };
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -36,6 +36,23 @@ function toOpenAIContent(content: string | MessageContentPart[]): string | OpenA
|
|||||||
: part.source.url!;
|
: part.source.url!;
|
||||||
return { type: 'image_url', image_url: { url } };
|
return { type: 'image_url', image_url: { url } };
|
||||||
}
|
}
|
||||||
|
if (part.type === 'audio') {
|
||||||
|
// GitHub Models uses OpenAI-compatible API — native audio via input_audio
|
||||||
|
const formatMap: Record<string, string> = {
|
||||||
|
'audio/wav': 'wav',
|
||||||
|
'audio/mpeg': 'mp3',
|
||||||
|
'audio/mp3': 'mp3',
|
||||||
|
'audio/ogg': 'ogg',
|
||||||
|
'audio/webm': 'webm',
|
||||||
|
'audio/mp4': 'mp4',
|
||||||
|
'audio/x-m4a': 'mp4',
|
||||||
|
};
|
||||||
|
const format = formatMap[part.source.media_type] ?? 'wav';
|
||||||
|
return {
|
||||||
|
type: 'input_audio',
|
||||||
|
input_audio: { data: part.source.data, format },
|
||||||
|
} as unknown as OpenAI.ChatCompletionContentPart;
|
||||||
|
}
|
||||||
// Fallback — shouldn't happen
|
// Fallback — shouldn't happen
|
||||||
return { type: 'text', text: JSON.stringify(part) };
|
return { type: 'text', text: JSON.stringify(part) };
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -8,17 +8,23 @@ export { LlamaCppClient, type LlamaCppClientConfig } from './local/index.js';
|
|||||||
export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js';
|
export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js';
|
||||||
export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
|
export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
|
||||||
export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
|
export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
|
||||||
|
export { supportsAudioInput } from './capabilities.js';
|
||||||
export {
|
export {
|
||||||
isSupportedImage,
|
isSupportedImage,
|
||||||
|
isSupportedAudio,
|
||||||
attachmentToImageSource,
|
attachmentToImageSource,
|
||||||
|
attachmentToAudioSource,
|
||||||
buildUserMessage,
|
buildUserMessage,
|
||||||
getMessageText,
|
getMessageText,
|
||||||
hasImages,
|
hasImages,
|
||||||
|
hasAudio,
|
||||||
|
stripAudioParts,
|
||||||
} from './media.js';
|
} from './media.js';
|
||||||
export type {
|
export type {
|
||||||
Message,
|
Message,
|
||||||
MessageContentPart,
|
MessageContentPart,
|
||||||
ImageSource,
|
ImageSource,
|
||||||
|
AudioSource,
|
||||||
ChatRequest,
|
ChatRequest,
|
||||||
ChatResponse,
|
ChatResponse,
|
||||||
ChatStreamEvent,
|
ChatStreamEvent,
|
||||||
|
|||||||
@@ -6,11 +6,14 @@ import {
|
|||||||
isSupportedImage,
|
isSupportedImage,
|
||||||
isSupportedAudio,
|
isSupportedAudio,
|
||||||
attachmentToImageSource,
|
attachmentToImageSource,
|
||||||
|
attachmentToAudioSource,
|
||||||
buildUserMessage,
|
buildUserMessage,
|
||||||
getMessageText,
|
getMessageText,
|
||||||
getMessageTextWithTools,
|
getMessageTextWithTools,
|
||||||
normalizeMessagesForLocal,
|
normalizeMessagesForLocal,
|
||||||
hasImages,
|
hasImages,
|
||||||
|
hasAudio,
|
||||||
|
stripAudioParts,
|
||||||
transcribeAudio,
|
transcribeAudio,
|
||||||
buildUserMessageWithAudio,
|
buildUserMessageWithAudio,
|
||||||
type AudioTranscriptionConfig,
|
type AudioTranscriptionConfig,
|
||||||
@@ -820,3 +823,212 @@ describe('normalizeMessagesForLocal', () => {
|
|||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 12. attachmentToAudioSource
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('attachmentToAudioSource', () => {
|
||||||
|
// Positive: supported audio type with data returns AudioSource.
|
||||||
|
it('returns AudioSource for supported audio type with data', () => {
|
||||||
|
const result = attachmentToAudioSource(oggAudioAttachment);
|
||||||
|
|
||||||
|
expect(result).toEqual({
|
||||||
|
media_type: 'audio/ogg',
|
||||||
|
data: 'AAAAAAAAAAAAAAAAAAAA',
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Negative: unsupported MIME type returns null.
|
||||||
|
it('returns null for unsupported mime type', () => {
|
||||||
|
const result = attachmentToAudioSource(pdfAttachment);
|
||||||
|
|
||||||
|
expect(result).toBeNull();
|
||||||
|
});
|
||||||
|
|
||||||
|
// Negative: supported audio type but no data returns null.
|
||||||
|
it('returns null when no data present', () => {
|
||||||
|
const noDataAudio = makeAttachment({
|
||||||
|
mimeType: 'audio/ogg',
|
||||||
|
filename: 'voice.ogg',
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = attachmentToAudioSource(noDataAudio);
|
||||||
|
|
||||||
|
expect(result).toBeNull();
|
||||||
|
});
|
||||||
|
|
||||||
|
// Negative: image attachment returns null.
|
||||||
|
it('returns null for image attachment', () => {
|
||||||
|
const result = attachmentToAudioSource(jpegBase64Attachment);
|
||||||
|
|
||||||
|
expect(result).toBeNull();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 13. hasAudio
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('hasAudio', () => {
|
||||||
|
// Negative: string content never has audio.
|
||||||
|
it('returns false for string content messages', () => {
|
||||||
|
const msg: Message = { role: 'user', content: 'no audio here' };
|
||||||
|
|
||||||
|
expect(hasAudio(msg)).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Negative: multimodal messages with only text parts have no audio.
|
||||||
|
it('returns false for multimodal messages with only text parts', () => {
|
||||||
|
const msg: Message = {
|
||||||
|
role: 'user',
|
||||||
|
content: [{ type: 'text', text: 'just text' }],
|
||||||
|
};
|
||||||
|
|
||||||
|
expect(hasAudio(msg)).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Negative: multimodal messages with only image parts have no audio.
|
||||||
|
it('returns false for multimodal messages with only image parts', () => {
|
||||||
|
const msg: Message = {
|
||||||
|
role: 'user',
|
||||||
|
content: [
|
||||||
|
{ type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
expect(hasAudio(msg)).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Positive: multimodal messages with audio parts are detected.
|
||||||
|
it('returns true for multimodal messages with audio parts', () => {
|
||||||
|
const msg: Message = {
|
||||||
|
role: 'user',
|
||||||
|
content: [
|
||||||
|
{ type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
expect(hasAudio(msg)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Positive: multimodal messages with mixed image + audio parts are detected.
|
||||||
|
it('returns true for multimodal messages with mixed image+audio parts', () => {
|
||||||
|
const msg: Message = {
|
||||||
|
role: 'user',
|
||||||
|
content: [
|
||||||
|
{ type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'img' } },
|
||||||
|
{ type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
expect(hasAudio(msg)).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 14. stripAudioParts
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('stripAudioParts', () => {
|
||||||
|
// String content passes through unchanged.
|
||||||
|
it('returns unchanged message for string content', () => {
|
||||||
|
const msg: Message = { role: 'user', content: 'plain text' };
|
||||||
|
|
||||||
|
const result = stripAudioParts(msg);
|
||||||
|
|
||||||
|
expect(result).toEqual({ role: 'user', content: 'plain text' });
|
||||||
|
});
|
||||||
|
|
||||||
|
// Audio part with transcript is replaced with transcript text.
|
||||||
|
it('replaces audio part with transcript text when transcript is present', () => {
|
||||||
|
const msg: Message = {
|
||||||
|
role: 'user',
|
||||||
|
content: [
|
||||||
|
{ type: 'text', text: 'Check this out' },
|
||||||
|
{ type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hello world' } },
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = stripAudioParts(msg);
|
||||||
|
|
||||||
|
expect(result.role).toBe('user');
|
||||||
|
expect(Array.isArray(result.content)).toBe(true);
|
||||||
|
const parts = result.content as Array<{ type: string; text?: string }>;
|
||||||
|
expect(parts).toHaveLength(2);
|
||||||
|
expect(parts[0]).toEqual({ type: 'text', text: 'Check this out' });
|
||||||
|
expect(parts[1]).toEqual({ type: 'text', text: '[Voice message]: Hello world' });
|
||||||
|
});
|
||||||
|
|
||||||
|
// Audio part without transcript is replaced with placeholder.
|
||||||
|
it('replaces audio part with placeholder when no transcript', () => {
|
||||||
|
const msg: Message = {
|
||||||
|
role: 'user',
|
||||||
|
content: [
|
||||||
|
{ type: 'text', text: 'Listen' },
|
||||||
|
{ type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = stripAudioParts(msg);
|
||||||
|
|
||||||
|
expect(Array.isArray(result.content)).toBe(true);
|
||||||
|
const parts = result.content as Array<{ type: string; text?: string }>;
|
||||||
|
expect(parts).toHaveLength(2);
|
||||||
|
expect(parts[0]).toEqual({ type: 'text', text: 'Listen' });
|
||||||
|
expect(parts[1]).toEqual({ type: 'text', text: '[Audio message received but no transcript available]' });
|
||||||
|
});
|
||||||
|
|
||||||
|
// Non-audio parts (text + image) are kept unchanged.
|
||||||
|
it('keeps non-audio parts unchanged', () => {
|
||||||
|
const msg: Message = {
|
||||||
|
role: 'user',
|
||||||
|
content: [
|
||||||
|
{ type: 'text', text: 'caption' },
|
||||||
|
{ type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = stripAudioParts(msg);
|
||||||
|
|
||||||
|
expect(result.content).toEqual([
|
||||||
|
{ type: 'text', text: 'caption' },
|
||||||
|
{ type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Simplifies to string content when only one text part remains after stripping.
|
||||||
|
it('simplifies to string content when only one text part remains after stripping', () => {
|
||||||
|
const msg: Message = {
|
||||||
|
role: 'user',
|
||||||
|
content: [
|
||||||
|
{ type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hi there' } },
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = stripAudioParts(msg);
|
||||||
|
|
||||||
|
expect(result).toEqual({ role: 'user', content: '[Voice message]: Hi there' });
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handles message with multiple audio parts.
|
||||||
|
it('handles message with multiple audio parts', () => {
|
||||||
|
const msg: Message = {
|
||||||
|
role: 'user',
|
||||||
|
content: [
|
||||||
|
{ type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'First message' } },
|
||||||
|
{ type: 'text', text: 'in between' },
|
||||||
|
{ type: 'audio', source: { media_type: 'audio/mpeg', data: 'BBBB' } },
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = stripAudioParts(msg);
|
||||||
|
|
||||||
|
expect(Array.isArray(result.content)).toBe(true);
|
||||||
|
const parts = result.content as Array<{ type: string; text?: string }>;
|
||||||
|
expect(parts).toHaveLength(3);
|
||||||
|
expect(parts[0]).toEqual({ type: 'text', text: '[Voice message]: First message' });
|
||||||
|
expect(parts[1]).toEqual({ type: 'text', text: 'in between' });
|
||||||
|
expect(parts[2]).toEqual({ type: 'text', text: '[Audio message received but no transcript available]' });
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|||||||
+80
-11
@@ -3,7 +3,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import type { Attachment } from '../channels/types.js';
|
import type { Attachment } from '../channels/types.js';
|
||||||
import type { MessageContentPart, ImageSource, Message } from './types.js';
|
import type { MessageContentPart, ImageSource, AudioSource, Message } from './types.js';
|
||||||
|
|
||||||
/** MIME types that vision models generally accept. */
|
/** MIME types that vision models generally accept. */
|
||||||
const SUPPORTED_IMAGE_TYPES = new Set([
|
const SUPPORTED_IMAGE_TYPES = new Set([
|
||||||
@@ -73,34 +73,55 @@ export function attachmentToImageSource(attachment: Attachment): ImageSource | n
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Turn a channel Attachment into an AudioSource for a model request.
 *
 * Only inline payloads are handled: the attachment must pass
 * `isSupportedAudio` and carry a non-empty `data` field.
 *
 * @param attachment - Attachment received from a channel.
 * @returns `{ media_type, data }` copied from the attachment, or `null`
 *          when the attachment is not supported audio or has no data.
 */
export function attachmentToAudioSource(attachment: Attachment): AudioSource | null {
  if (isSupportedAudio(attachment) && attachment.data) {
    return {
      media_type: attachment.mimeType,
      data: attachment.data,
    };
  }

  return null;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Build a multimodal Message from text + attachments.
|
* Build a multimodal Message from text + attachments.
|
||||||
* If there are no image attachments, returns a plain text Message.
|
* If there are no image or audio attachments, returns a plain text Message.
|
||||||
* If there are image attachments, returns a Message with structured content parts.
|
* If there are image/audio attachments, returns a Message with structured content parts.
|
||||||
*/
|
*/
|
||||||
export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
|
export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
|
||||||
const imageParts: MessageContentPart[] = [];
|
const mediaParts: MessageContentPart[] = [];
|
||||||
|
|
||||||
if (attachments) {
|
if (attachments) {
|
||||||
for (const att of attachments) {
|
for (const att of attachments) {
|
||||||
const source = attachmentToImageSource(att);
|
const imageSource = attachmentToImageSource(att);
|
||||||
if (source) {
|
if (imageSource) {
|
||||||
imageParts.push({ type: 'image', source });
|
mediaParts.push({ type: 'image', source: imageSource });
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const audioSource = attachmentToAudioSource(att);
|
||||||
|
if (audioSource) {
|
||||||
|
mediaParts.push({ type: 'audio', source: audioSource });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// No images — return simple text message (preserves backward compat)
|
// No media — return simple text message (preserves backward compat)
|
||||||
if (imageParts.length === 0) {
|
if (mediaParts.length === 0) {
|
||||||
return { role: 'user', content: text };
|
return { role: 'user', content: text };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build multimodal content: text first, then images
|
// Build multimodal content: text first, then media
|
||||||
const parts: MessageContentPart[] = [];
|
const parts: MessageContentPart[] = [];
|
||||||
if (text) {
|
if (text) {
|
||||||
parts.push({ type: 'text', text });
|
parts.push({ type: 'text', text });
|
||||||
}
|
}
|
||||||
parts.push(...imageParts);
|
parts.push(...mediaParts);
|
||||||
|
|
||||||
return { role: 'user', content: parts };
|
return { role: 'user', content: parts };
|
||||||
}
|
}
|
||||||
@@ -148,6 +169,13 @@ export function getMessageTextWithTools(message: Message): string {
|
|||||||
const content = (block.content as string) ?? '';
|
const content = (block.content as string) ?? '';
|
||||||
const isError = block.is_error ? ' (error)' : '';
|
const isError = block.is_error ? ' (error)' : '';
|
||||||
parts.push(`[Tool result${isError}: ${content}]`);
|
parts.push(`[Tool result${isError}: ${content}]`);
|
||||||
|
} else if (block.type === 'audio') {
|
||||||
|
const source = block.source as Record<string, unknown>;
|
||||||
|
if (source?.transcript) {
|
||||||
|
parts.push(`[Voice message]: ${source.transcript}`);
|
||||||
|
} else {
|
||||||
|
parts.push('[Audio attachment]');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return parts.join('\n');
|
return parts.join('\n');
|
||||||
@@ -298,3 +326,44 @@ export function hasImages(message: Message): boolean {
|
|||||||
}
|
}
|
||||||
return message.content.some(p => p.type === 'image');
|
return message.content.some(p => p.type === 'image');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Report whether a message carries at least one `audio` content part.
 *
 * Plain-string content can never contain audio, so it always yields `false`.
 */
export function hasAudio(message: Message): boolean {
  const { content } = message;
  return typeof content !== 'string' && content.some((part) => part.type === 'audio');
}
|
||||||
|
|
||||||
|
/**
 * Produce a copy of `message` in which every audio part has been swapped for a text part.
 *
 * An audio part with a transcript becomes `[Voice message]: <transcript>`; one without
 * becomes a fixed "no transcript available" notice. All other parts are passed through
 * untouched and keep their position. Intended for providers that cannot take audio input.
 *
 * @param message - Message whose content is either a plain string or a list of parts.
 * @returns The same message object when content is a string; otherwise a shallow copy
 *          whose content is the rewritten part list, collapsed to a plain string when
 *          exactly one text part remains.
 */
export function stripAudioParts(message: Message): Message {
  const { content } = message;
  if (typeof content === 'string') {
    return message;
  }

  const textOnly = content.map((part): MessageContentPart => {
    if (part.type !== 'audio') {
      return part;
    }
    const { transcript } = part.source;
    return {
      type: 'text',
      text: transcript
        ? `[Voice message]: ${transcript}`
        : '[Audio message received but no transcript available]',
    };
  });

  // A lone text part carries no multimodal information — hand back simple string content.
  const soleTextPart = textOnly.length === 1 && textOnly[0].type === 'text' ? textOnly[0] : null;
  return soleTextPart
    ? { ...message, content: soleTextPart.text }
    : { ...message, content: textOnly };
}
|
||||||
|
|||||||
@@ -28,6 +28,24 @@ function toOpenAIContent(content: string | MessageContentPart[]): string | OpenA
|
|||||||
: part.source.url!;
|
: part.source.url!;
|
||||||
return { type: 'image_url', image_url: { url } };
|
return { type: 'image_url', image_url: { url } };
|
||||||
}
|
}
|
||||||
|
if (part.type === 'audio') {
|
||||||
|
// OpenAI native audio input via input_audio content part
|
||||||
|
// Determine format from MIME type (OpenAI supports: wav, mp3, flac, opus, ogg, webm)
|
||||||
|
const formatMap: Record<string, string> = {
|
||||||
|
'audio/wav': 'wav',
|
||||||
|
'audio/mpeg': 'mp3',
|
||||||
|
'audio/mp3': 'mp3',
|
||||||
|
'audio/ogg': 'ogg',
|
||||||
|
'audio/webm': 'webm',
|
||||||
|
'audio/mp4': 'mp4',
|
||||||
|
'audio/x-m4a': 'mp4',
|
||||||
|
};
|
||||||
|
const format = formatMap[part.source.media_type] ?? 'wav';
|
||||||
|
return {
|
||||||
|
type: 'input_audio',
|
||||||
|
input_audio: { data: part.source.data, format },
|
||||||
|
} as unknown as OpenAI.ChatCompletionContentPart;
|
||||||
|
}
|
||||||
// Fallback — shouldn't happen
|
// Fallback — shouldn't happen
|
||||||
return { type: 'text', text: JSON.stringify(part) };
|
return { type: 'text', text: JSON.stringify(part) };
|
||||||
});
|
});
|
||||||
|
|||||||
+13
-1
@@ -9,10 +9,21 @@ export interface ImageSource {
|
|||||||
url?: string;
|
url?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Audio source for multimodal content blocks. */
|
||||||
|
export interface AudioSource {
|
||||||
|
/** MIME type (e.g. "audio/ogg", "audio/mpeg", "audio/wav", "audio/webm"). */
|
||||||
|
media_type: string;
|
||||||
|
/** Base64-encoded audio data. */
|
||||||
|
data: string;
|
||||||
|
/** Optional transcript (from Whisper) — used when the model doesn't support native audio. */
|
||||||
|
transcript?: string;
|
||||||
|
}
|
||||||
|
|
||||||
/** Individual content part within a multimodal message. */
|
/** Individual content part within a multimodal message. */
|
||||||
export type MessageContentPart =
|
export type MessageContentPart =
|
||||||
| { type: 'text'; text: string }
|
| { type: 'text'; text: string }
|
||||||
| { type: 'image'; source: ImageSource };
|
| { type: 'image'; source: ImageSource }
|
||||||
|
| { type: 'audio'; source: AudioSource };
|
||||||
|
|
||||||
export interface Message {
|
export interface Message {
|
||||||
role: 'user' | 'assistant';
|
role: 'user' | 'assistant';
|
||||||
@@ -43,6 +54,7 @@ export interface ModelToolCall {
|
|||||||
export type ContentBlock =
|
export type ContentBlock =
|
||||||
| { type: 'text'; text: string }
|
| { type: 'text'; text: string }
|
||||||
| { type: 'image'; source: ImageSource }
|
| { type: 'image'; source: ImageSource }
|
||||||
|
| { type: 'audio'; source: AudioSource }
|
||||||
| { type: 'tool_use'; id: string; name: string; input: unknown };
|
| { type: 'tool_use'; id: string; name: string; input: unknown };
|
||||||
|
|
||||||
// Tool result fed back into conversation
|
// Tool result fed back into conversation
|
||||||
|
|||||||
@@ -0,0 +1,214 @@
|
|||||||
|
import type { Tool, ToolResult } from '../types.js';
|
||||||
|
|
||||||
|
interface AudioTranscribeArgs {
|
||||||
|
data?: string;
|
||||||
|
url?: string;
|
||||||
|
mime_type?: string;
|
||||||
|
language?: string;
|
||||||
|
prompt?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
const SUPPORTED_MIME_TYPES = new Set([
|
||||||
|
'audio/ogg',
|
||||||
|
'audio/mpeg',
|
||||||
|
'audio/mp3',
|
||||||
|
'audio/wav',
|
||||||
|
'audio/webm',
|
||||||
|
'audio/mp4',
|
||||||
|
'audio/x-m4a',
|
||||||
|
]);
|
||||||
|
|
||||||
|
const PROVIDER_ENDPOINTS: Record<string, string> = {
|
||||||
|
openai: 'https://api.openai.com/v1/audio/transcriptions',
|
||||||
|
groq: 'https://api.groq.com/openai/v1/audio/transcriptions',
|
||||||
|
ollama: 'http://localhost:11434/api/generate',
|
||||||
|
llamacpp: 'http://localhost:8080/v1/audio/transcriptions',
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Check the arguments of an `audio.transcribe` call before any work is done.
 *
 * Rules, in the order they are applied:
 *  1. exactly one of `data` / `url` must be a non-empty value;
 *  2. `mime_type` is mandatory alongside `data`;
 *  3. when `mime_type` is given it must be in SUPPORTED_MIME_TYPES.
 *
 * @returns `{ valid: true }` on success, otherwise `{ valid: false, error }`
 *          with a message describing the first rule that failed.
 */
function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
  const isProvided = (value?: string): boolean => value !== undefined && value !== '';
  const reject = (error: string) => ({ valid: false, error });

  const dataGiven = isProvided(args.data);
  const urlGiven = isProvided(args.url);

  // Equal flags mean either both sources were supplied or neither was.
  if (dataGiven === urlGiven) {
    return reject(
      dataGiven
        ? 'Only one of data or url can be provided'
        : 'Either data or url must be provided',
    );
  }

  const mimeType = args.mime_type;

  if (dataGiven && !mimeType) {
    return reject('mime_type is required when using data');
  }

  if (mimeType && !SUPPORTED_MIME_TYPES.has(mimeType)) {
    const supported = Array.from(SUPPORTED_MIME_TYPES).join(', ');
    return reject(`Unsupported MIME type: ${mimeType}. Supported: ${supported}`);
  }

  return { valid: true };
}
|
||||||
|
|
||||||
|
interface AudioTranscriptionConfig {
|
||||||
|
endpoint?: string;
|
||||||
|
apiKey?: string;
|
||||||
|
model?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** File extension used for the uploaded filename, keyed by the MIME type of inline audio. */
const EXTENSION_BY_MIME_TYPE: Record<string, string> = {
  'audio/ogg': 'ogg',
  'audio/mpeg': 'mp3',
  'audio/mp3': 'mp3',
  'audio/wav': 'wav',
  'audio/webm': 'webm',
  'audio/mp4': 'm4a',
  'audio/x-m4a': 'm4a',
};

/**
 * Derive a file extension from the last path segment of a URL.
 * Query string and fragment are ignored; returns 'bin' when the segment has no extension.
 */
function extensionFromUrl(url: string): string {
  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL — strip the query/fragment by hand instead.
    pathname = url.split(/[?#]/)[0] ?? '';
  }

  const lastSegment = pathname.split('/').pop() ?? '';
  const dotIndex = lastSegment.lastIndexOf('.');
  const hasExtension = dotIndex > 0 && dotIndex < lastSegment.length - 1;
  return hasExtension ? lastSegment.slice(dotIndex + 1) : 'bin';
}

/**
 * Build the `audio.transcribe` tool.
 *
 * The tool accepts either base64 audio (`data` + `mime_type`) or a `url` to download,
 * and posts the audio as multipart form data (`file`, `model`, optional `language` and
 * `prompt`) to the configured endpoint, returning the `text` field of the JSON reply.
 * When the endpoint equals `PROVIDER_ENDPOINTS.ollama` a JSON body is posted instead
 * and the `response` field of the reply is returned.
 *
 * Failures never throw out of `execute`: validation errors, a missing endpoint,
 * download errors and non-2xx replies are all reported as `{ success: false, error }`.
 *
 * @param audioConfig - Endpoint, optional bearer API key and optional model name
 *                      (defaults to `whisper-1`).
 * @returns A Tool whose `execute` resolves to a ToolResult.
 */
export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig): Tool {
  return {
    name: 'audio.transcribe',
    description: 'Transcribe an audio file to text using a Whisper-compatible API. Accepts base64-encoded audio data or a URL to download to audio file.',
    inputSchema: {
      type: 'object',
      properties: {
        data: {
          type: 'string',
          description: 'Base64-encoded audio data (alternative to url)',
        },
        url: {
          type: 'string',
          description: 'URL to download to audio file (alternative to data)',
        },
        mime_type: {
          type: 'string',
          description: 'MIME type of audio file (required when using data, e.g., audio/ogg, audio/mpeg, audio/wav)',
        },
        language: {
          type: 'string',
          description: 'Language code (e.g., en, es, fr) - optional',
        },
        prompt: {
          type: 'string',
          description: 'Optional text to guide transcription (OpenAI/Groq/custom only)',
        },
      },
    },

    execute: async (rawArgs: unknown): Promise<ToolResult> => {
      const args = rawArgs as AudioTranscribeArgs;

      const validation = validateInput(args);
      if (!validation.valid) {
        return {
          success: false,
          output: '',
          error: validation.error,
        };
      }

      if (!audioConfig?.endpoint) {
        return {
          success: false,
          output: '',
          error: 'Audio transcription endpoint not configured. Set audio.endpoint in config.yaml',
        };
      }

      try {
        const endpoint = audioConfig.endpoint;
        const model = audioConfig.model ?? 'whisper-1';

        // The Ollama branch sends the raw argument, so it runs before any download.
        if (endpoint === PROVIDER_ENDPOINTS.ollama) {
          // NOTE(review): for url input this forwards the URL string itself in `audio`;
          // confirm that is what the Ollama endpoint expects.
          const ollamaResponse = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model: model,
              audio: args.data ?? args.url,
              stream: false,
            }),
          });

          if (!ollamaResponse.ok) {
            throw new Error(`Ollama transcription failed: ${ollamaResponse.status}`);
          }

          const ollamaJson = await ollamaResponse.json() as { response?: string };
          return {
            success: true,
            output: ollamaJson.response ?? 'No response from Ollama',
          };
        }

        let filename: string;
        let audioBlob: Blob;

        if (args.data) {
          // Copy the decoded bytes into their own array. `Buffer#buffer` may be Node's
          // shared allocation pool, which is larger than (and offset from) this buffer,
          // so passing it to Blob directly could upload unrelated bytes.
          const audioBytes = new Uint8Array(Buffer.from(args.data, 'base64'));

          const ext = EXTENSION_BY_MIME_TYPE[args.mime_type!] || 'bin';
          filename = `audio.${ext}`;

          audioBlob = new Blob([audioBytes], { type: args.mime_type ?? 'audio/wav' });
        } else if (args.url) {
          const download = await fetch(args.url);
          if (!download.ok) {
            throw new Error(`Failed to download audio from ${args.url}: ${download.status} ${download.statusText}`);
          }
          const arrayBuffer = await download.arrayBuffer();

          filename = `audio.${extensionFromUrl(args.url)}`;

          // Honour a caller-supplied MIME type; otherwise keep the historical default.
          audioBlob = new Blob([arrayBuffer], { type: args.mime_type ?? 'audio/wav' });
        } else {
          // Unreachable after validateInput, but keeps audioBlob definitely assigned.
          throw new Error('Either data or url must be provided');
        }

        const formData = new FormData();
        formData.append('file', audioBlob, filename);
        formData.append('model', model);

        if (args.language) {
          formData.append('language', args.language);
        }

        if (args.prompt) {
          formData.append('prompt', args.prompt);
        }

        // Content-Type is left unset so fetch can add the multipart boundary itself.
        const fetchOptions: RequestInit = {
          method: 'POST',
          body: formData,
        };

        if (audioConfig.apiKey) {
          fetchOptions.headers = { Authorization: `Bearer ${audioConfig.apiKey}` };
        }

        const response = await fetch(endpoint, fetchOptions);

        if (!response.ok) {
          const errorText = await response.text();
          throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
        }

        const json = await response.json() as { text: string };
        return {
          success: true,
          output: json.text,
        };
      } catch (error) {
        return {
          success: false,
          output: '',
          error: error instanceof Error ? error.message : 'Unknown error occurred',
        };
      }
    },
  };
}
|
||||||
@@ -8,6 +8,7 @@ export { systemInfoTool } from './system-info.js';
|
|||||||
export { webFetchTool } from './web-fetch.js';
|
export { webFetchTool } from './web-fetch.js';
|
||||||
export { createMediaSendTool } from './media-send.js';
|
export { createMediaSendTool } from './media-send.js';
|
||||||
export { createImageAnalyzeTool } from './image-analyze.js';
|
export { createImageAnalyzeTool } from './image-analyze.js';
|
||||||
|
export { createAudioTranscribeTool } from './audio-transcribe.js';
|
||||||
export { createMemoryReadTool } from './memory-read.js';
|
export { createMemoryReadTool } from './memory-read.js';
|
||||||
export { createMemoryWriteTool } from './memory-write.js';
|
export { createMemoryWriteTool } from './memory-write.js';
|
||||||
export { createMemorySearchTool } from './memory-search.js';
|
export { createMemorySearchTool } from './memory-search.js';
|
||||||
|
|||||||
+1
-1
@@ -5,7 +5,7 @@ export { ToolExecutor } from './executor.js';
|
|||||||
export type { ToolExecutorConfig } from './executor.js';
|
export type { ToolExecutorConfig } from './executor.js';
|
||||||
export { ToolPolicy } from './policy.js';
|
export { ToolPolicy } from './policy.js';
|
||||||
export type { ToolPolicyContext } from './policy.js';
|
export type { ToolPolicyContext } from './policy.js';
|
||||||
export { allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool, createSessionTools, createAgentsListTool, createMessageSendTool, createCronTools, createGmailTools, createGcalTools, createGdocsTools, createGdriveTools, createGtasksTools } from './builtin/index.js';
|
export { allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool, createAudioTranscribeTool, createSessionTools, createAgentsListTool, createMessageSendTool, createCronTools, createGmailTools, createGcalTools, createGdocsTools, createGdriveTools, createGtasksTools } from './builtin/index.js';
|
||||||
export type { WebSearchConfig } from './builtin/web-search.js';
|
export type { WebSearchConfig } from './builtin/web-search.js';
|
||||||
export type { ProcessManagerConfig } from './builtin/process/index.js';
|
export type { ProcessManagerConfig } from './builtin/process/index.js';
|
||||||
export type { BrowserManagerConfig } from './builtin/browser/index.js';
|
export type { BrowserManagerConfig } from './builtin/browser/index.js';
|
||||||
|
|||||||
Reference in New Issue
Block a user