diff --git a/README.md b/README.md index 85d4040..fef4d7c 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,8 @@ Self-hosted personal AI assistant with Telegram and Terminal interfaces. - **Docker Sandboxing**: Per-session container isolation for tool execution - **Multi-Agent Routing**: Config-driven agent selection per sender/channel with tool profiles - **Media Pipeline**: Image analysis, outbound attachments, audio transcription and native audio passthrough across all channels +- **Talk Mode (Wake Phrase)**: Optional wake-phrase gating (`audio.talk_mode`) with timed conversation windows +- **Capture Tools**: `screen.capture` and `camera.capture` tools for host capture workflows - **Session Transfer**: Move conversations between frontends - **CLI**: Full command-line interface (`flynn start`, `send`, `doctor`, `completion`, etc.) - **Shell Completion**: Auto-generated completions for bash, zsh, and fish with `--install` flag @@ -294,6 +296,10 @@ audio: | `provider.endpoint` | yes | Whisper-compatible API endpoint | | `provider.api_key` | no | Bearer token for authentication | | `provider.model` | no | Model name sent in request (default: `whisper-1`) | +| `talk_mode.enabled` | no | Enable wake-phrase talk mode gating (default: `false`) | +| `talk_mode.wake_phrase` | no | Phrase that activates talk mode (default: `hey flynn`) | +| `talk_mode.timeout_ms` | no | Active listen window after wake, in milliseconds (default: `120000`) | +| `talk_mode.allow_manual_toggle` | no | Enable `/talk on\|off\|status` controls (default: `true`) | Without an `audio` config, voice messages from non-audio-capable models will display an error message to the user. 
For local transcription, you can run a whisper.cpp server: @@ -314,6 +320,17 @@ docker run -d \ # docker compose up -d ``` +### Capture Tools + +Flynn includes host capture tools: +- `screen.capture` -> captures current screen and returns base64 image payload +- `camera.capture` -> captures one camera frame and returns base64 image payload + +Notes: +- These are host-command wrappers and require platform binaries: + - macOS: `screencapture` (screen), `imagesnap` (camera) + - Linux: `grim` or ImageMagick `import` (screen), `ffmpeg` (camera) + ## Telegram Commands | Command | Description | diff --git a/config/default.yaml b/config/default.yaml index 629d0ee..478d814 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -248,3 +248,8 @@ hooks: # endpoint: "http://localhost:18801/v1/audio/transcriptions" # api_key: "${WHISPER_API_KEY}" # Optional Bearer token # model: "whisper-1" # Model name (default: whisper-1) +# talk_mode: +# enabled: false +# wake_phrase: "hey flynn" +# timeout_ms: 120000 +# allow_manual_toggle: true diff --git a/src/config/schema.test.ts b/src/config/schema.test.ts index 2ef5c56..bf384cf 100644 --- a/src/config/schema.test.ts +++ b/src/config/schema.test.ts @@ -350,6 +350,39 @@ describe('configSchema — signal', () => { }); }); +describe('configSchema — audio talk mode', () => { + const minimalConfig = { + telegram: { bot_token: 'test', allowed_chat_ids: [1] }, + models: { default: { provider: 'anthropic', model: 'claude-3' } }, + }; + + it('defaults talk_mode fields', () => { + const result = configSchema.parse(minimalConfig); + expect(result.audio.talk_mode.enabled).toBe(false); + expect(result.audio.talk_mode.wake_phrase).toBe('hey flynn'); + expect(result.audio.talk_mode.timeout_ms).toBe(120000); + expect(result.audio.talk_mode.allow_manual_toggle).toBe(true); + }); + + it('accepts custom talk_mode settings', () => { + const result = configSchema.parse({ + ...minimalConfig, + audio: { + talk_mode: { + enabled: true, + wake_phrase: 'ok 
/** Shortest talk-mode listen window the config accepts, in milliseconds. */
const TALK_MODE_TIMEOUT_MIN_MS = 1000;
/** Longest talk-mode listen window the config accepts (one hour), in milliseconds. */
const TALK_MODE_TIMEOUT_MAX_MS = 60 * 60 * 1000;

/**
 * `audio.talk_mode` — wake-phrase gating for inbound messages.
 *
 * Every field has a default, and the object itself defaults to `{}`, so a
 * config that omits `talk_mode` parses to a fully-populated, disabled block.
 */
const talkModeSchema = z
  .object({
    // Gating is opt-in.
    enabled: z.boolean().default(false),
    // Phrase that opens the listen window.
    wake_phrase: z.string().default('hey flynn'),
    // How long the listen window stays open after a wake.
    timeout_ms: z
      .number()
      .min(TALK_MODE_TIMEOUT_MIN_MS)
      .max(TALK_MODE_TIMEOUT_MAX_MS)
      .default(120000),
    // Whether the `/talk on|off|status` controls are honoured.
    allow_manual_toggle: z.boolean().default(true),
  })
  .default({});

/**
 * Top-level `audio` section: the transcription switch, its optional provider,
 * and the talk-mode block. Defaults to `{}` so the section may be left out.
 */
const audioSchema = z
  .object({
    enabled: z.boolean().default(false),
    provider: audioProviderSchema.optional(),
    talk_mode: talkModeSchema,
  })
  .default({});
}; + + const router = createMessageRouter({ + sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'], + modelRouter: { + getAvailableTiers: () => ['fast', 'default', 'complex', 'local'], + getAllLabels: () => ({ fast: 'fast', default: 'default', complex: 'complex', local: 'local' }), + getLabel: (tier: string) => tier, + } as unknown as MessageRouterDeps['modelRouter'], + systemPrompt: 'test prompt', + toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'], + toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'], + config: { + agents: { + primary_tier: 'default', + delegation: { compaction: 'fast', memory_extraction: 'fast', classification: 'fast', tool_summarisation: 'fast', complex_reasoning: 'complex' }, + max_delegation_depth: 3, + max_iterations: 10, + }, + compaction: { enabled: false }, + models: { default: { provider: 'anthropic', model: 'claude' } }, + audio: { talk_mode: { enabled: true, wake_phrase: 'hey flynn', timeout_ms: 120000, allow_manual_toggle: true } }, + } as unknown as MessageRouterDeps['config'], + }); + + const reply = vi.fn(async (_message: OutboundMessage) => {}); + + await router.handler({ + id: 'm-talk-1', + channel: 'telegram', + senderId: 'user-talk-1', + text: 'hello there', + timestamp: Date.now(), + } as MessageRouterInput, reply); + expect(processSpy).not.toHaveBeenCalled(); + + await router.handler({ + id: 'm-talk-2', + channel: 'telegram', + senderId: 'user-talk-1', + text: 'hey flynn what time is it?', + timestamp: Date.now(), + } as MessageRouterInput, reply); + expect(processSpy).toHaveBeenCalledOnce(); + expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined); + }); +}); diff --git a/src/daemon/routing.ts b/src/daemon/routing.ts index 5da35fb..cfef711 100644 --- a/src/daemon/routing.ts +++ b/src/daemon/routing.ts @@ -70,6 +70,7 @@ export function createMessageRouter(deps: { } { // Cache agents by 
session ID + agent config name to avoid recreating on every message const agents = new Map(); + const talkModeUntil = new Map(); function getOrCreateAgent(channel: string, senderId: string, metadata?: Record, agentOverride?: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } { // Resolve agent config name via routing (sender → channel → default fallback) @@ -246,10 +247,60 @@ export function createMessageRouter(deps: { } const handler = async (msg: InboundMessage, reply: (response: OutboundMessage) => Promise): Promise => { + let incomingText = msg.text; + const talkMode = deps.config.audio?.talk_mode; + if (talkMode?.enabled && incomingText.trim().length > 0) { + const key = `${msg.channel}:${msg.senderId}`; + const now = Date.now(); + const timeoutMs = talkMode.timeout_ms; + const currentUntil = talkModeUntil.get(key) ?? 0; + const lower = incomingText.trim().toLowerCase(); + + if (talkMode.allow_manual_toggle) { + if (lower === '/talk on') { + talkModeUntil.set(key, now + timeoutMs); + await reply({ text: `Talk mode enabled for ${Math.ceil(timeoutMs / 1000)}s.`, replyTo: msg.id }); + return; + } + if (lower === '/talk off') { + talkModeUntil.delete(key); + await reply({ text: 'Talk mode disabled.', replyTo: msg.id }); + return; + } + if (lower === '/talk status') { + if (currentUntil <= now) { + await reply({ text: 'Talk mode is idle (wake phrase required).', replyTo: msg.id }); + } else { + await reply({ text: `Talk mode active for ${Math.ceil((currentUntil - now) / 1000)}s.`, replyTo: msg.id }); + } + return; + } + } + + const phrase = talkMode.wake_phrase.trim(); + const wakeRegex = phrase + ? 
new RegExp(`^\\s*${escapeRegex(phrase)}(?:[\\s,:!.-]+)?`, 'i') + : null; + const wakeMatched = Boolean(wakeRegex && wakeRegex.test(incomingText)); + + if (wakeMatched && wakeRegex) { + talkModeUntil.set(key, now + timeoutMs); + incomingText = incomingText.replace(wakeRegex, '').trim(); + if (!incomingText) { + await reply({ text: `Listening. Talk mode active for ${Math.ceil(timeoutMs / 1000)}s.`, replyTo: msg.id }); + return; + } + } else if (currentUntil > now) { + talkModeUntil.set(key, now + timeoutMs); + } else { + return; + } + } + let intentAgentOverride: string | undefined; let intentSkillOverride: string | undefined; if (deps.config.intents?.enabled && deps.intentRegistry) { - const intentMatch = deps.intentRegistry.match(msg.text); + const intentMatch = deps.intentRegistry.match(incomingText); if (intentMatch?.rule.target.type === 'agent') { let confidence = intentMatch.score; @@ -298,7 +349,7 @@ export function createMessageRouter(deps: { const commandInput = msg.metadata?.isCommand && typeof msg.metadata.command === 'string' ? `/${msg.metadata.command}${msg.metadata.commandArgs ? ` ${msg.metadata.commandArgs}` : ''}` - : msg.text; + : incomingText; if (deps.commandRegistry && deps.commandRegistry.isCommand(commandInput)) { const session = deps.sessionManager.getSession(msg.channel, msg.senderId); @@ -604,7 +655,7 @@ export function createMessageRouter(deps: { const supportsAudioOverride = (tierConfig as Record | undefined)?.supports_audio as boolean | undefined; const nativeAudioSupported = supportsAudioInput(modelProvider, modelName, supportsAudioOverride); - let messageText = msg.text; + let messageText = incomingText; let attachments = msg.attachments; const audioAttachments = (msg.attachments ?? 
/**
 * Escape a literal string so it can be embedded in a `RegExp` source and
 * match itself verbatim.
 *
 * @param value - Text to be matched literally.
 * @returns `value` with each regex syntax character
 *   (`. * + ? ^ $ { } ( ) | [ ] \`) prefixed by a backslash.
 */
function escapeRegex(value: string): string {
  const specials = '.*+?^${}()|[]\\';
  let escaped = '';
  for (const ch of value) {
    escaped += specials.includes(ch) ? '\\' + ch : ch;
  }
  return escaped;
}
/** Arguments accepted by the `screen.capture` tool. */
interface CaptureArgs {
  format?: 'png' | 'jpg';
}

/** Arguments accepted by the `camera.capture` tool. */
interface CameraCaptureArgs extends CaptureArgs {
  device?: string;
}

/** Image format after defaulting; anything other than `'jpg'` becomes `'png'`. */
type CaptureFormat = NonNullable<CaptureArgs['format']>;

/** One host command (binary + argv) that writes an image to the output path. */
interface CaptureCommand {
  command: string;
  args: string[];
}

/** Upper bound on how long a single capture binary may run. */
const DEFAULT_TIMEOUT_MS = 15000;

/** Build a unique temp-file path whose extension is the requested format. */
function tempPath(format: string): string {
  return join(tmpdir(), `flynn-capture-${randomUUID()}.${format}`);
}

/**
 * Run a host binary without a shell and wait for it to exit.
 *
 * @param command - Executable name or path.
 * @param args - Arguments passed verbatim (no shell interpolation).
 * @param timeoutMs - Timeout handed to `execFile`.
 * @throws Error containing the command line and stderr (or the spawn error
 *   message when stderr is empty) if the process fails or times out.
 */
function exec(command: string, args: string[], timeoutMs = DEFAULT_TIMEOUT_MS): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    execFile(command, args, { timeout: timeoutMs }, (error, _stdout, stderr) => {
      if (error) {
        const detail = String(stderr).trim() || error.message;
        reject(new Error(`${command} ${args.join(' ')} failed: ${detail}`));
        return;
      }
      resolve();
    });
  });
}

/** Read a file as base64 and always attempt to delete it afterwards. */
async function readBase64AndCleanup(path: string): Promise<string> {
  try {
    const data = await readFile(path);
    return data.toString('base64');
  } finally {
    // Best-effort: a failed unlink must not mask the read result or error.
    await unlink(path).catch(() => {});
  }
}

/**
 * Normalise untrusted tool input into a format and an optional device.
 * Non-object input, unknown formats and blank or non-string devices fall back
 * to the defaults instead of reaching the host command line.
 */
function parseCaptureArgs(rawArgs: unknown): CameraCaptureArgs & { format: CaptureFormat } {
  const raw: Record<string, unknown> =
    typeof rawArgs === 'object' && rawArgs !== null ? (rawArgs as Record<string, unknown>) : {};
  const format: CaptureFormat = raw['format'] === 'jpg' ? 'jpg' : 'png';
  const rawDevice = raw['device'];
  const device = typeof rawDevice === 'string' && rawDevice.trim() !== '' ? rawDevice : undefined;
  return { format, device };
}

/** MIME type reported for a capture format. */
function mimeTypeFor(format: CaptureFormat): string {
  return format === 'jpg' ? 'image/jpeg' : 'image/png';
}

/**
 * Commands to try, in order, for a screen capture on `platform`.
 *
 * @returns `null` when the platform is unsupported.
 */
function screenCaptureCommands(platform: string, format: CaptureFormat, out: string): CaptureCommand[] | null {
  if (platform === 'darwin') {
    return [{ command: 'screencapture', args: ['-x', '-t', format, out] }];
  }
  if (platform === 'linux') {
    return [
      // grim picks its encoder from -t, not from the file extension; without
      // it a ".jpg" request would contain PNG bytes labelled image/jpeg.
      { command: 'grim', args: ['-t', format === 'jpg' ? 'jpeg' : 'png', out] },
      // Fallback: ImageMagick infers the format from the extension.
      { command: 'import', args: ['-window', 'root', out] },
    ];
  }
  return null;
}

/**
 * Commands to try, in order, for a single camera frame on `platform`.
 *
 * @param device - Optional camera identifier; a device name on macOS, a
 *   device node on Linux (defaults to `/dev/video0`).
 * @returns `null` when the platform is unsupported.
 */
function cameraCaptureCommands(
  platform: string,
  format: CaptureFormat,
  out: string,
  device?: string,
): CaptureCommand[] | null {
  if (platform === 'darwin') {
    // Honour the documented `device` argument instead of silently ignoring it.
    const deviceArgs = device === undefined ? [] : ['-d', device];
    return [{ command: 'imagesnap', args: ['-q', '-w', '1', ...deviceArgs, out] }];
  }
  if (platform === 'linux') {
    return [
      {
        command: 'ffmpeg',
        // -hide_banner/-loglevel keep stderr (and thus the returned error
        // text) limited to the actual failure.
        args: [
          '-hide_banner', '-loglevel', 'error',
          '-y', '-f', 'video4linux2', '-i', device ?? '/dev/video0', '-frames:v', '1', out,
        ],
      },
    ];
  }
  return null;
}

/**
 * Run the first command that succeeds and return the captured image as base64.
 * The temp file is removed on every path, including when all commands fail
 * after one of them has already created or partially written it.
 *
 * @throws The error from the last command attempted when none succeeds, or
 *   the read error if the output file cannot be read.
 */
async function captureToBase64(commands: readonly CaptureCommand[], out: string): Promise<string> {
  let lastError: unknown;
  let captured = false;
  for (const { command, args } of commands) {
    try {
      await exec(command, args);
      captured = true;
      break;
    } catch (error) {
      lastError = error;
    }
  }
  if (!captured) {
    await unlink(out).catch(() => {});
    throw lastError ?? new Error('no capture command available');
  }
  return readBase64AndCleanup(out);
}

/** Shared execute path: platform guard, capture, and ToolResult shaping. */
async function runCapture(
  toolName: string,
  commands: CaptureCommand[] | null,
  out: string,
  format: CaptureFormat,
): Promise<ToolResult> {
  if (commands === null) {
    return { success: false, output: '', error: `${toolName} is only supported on macOS/Linux hosts` };
  }
  try {
    const data = await captureToBase64(commands, out);
    return { success: true, output: JSON.stringify({ mimeType: mimeTypeFor(format), data }) };
  } catch (error) {
    return {
      success: false,
      output: '',
      error: error instanceof Error ? error.message : String(error),
    };
  }
}

/**
 * `screen.capture` — captures the current screen on the host.
 * On success `output` is a JSON string `{ mimeType, data }` where `data` is
 * the base64-encoded image. Never throws; failures are reported via `error`.
 */
export const screenCaptureTool: Tool = {
  name: 'screen.capture',
  description: 'Capture the current screen and return a base64-encoded image (png/jpg).',
  inputSchema: {
    type: 'object',
    properties: {
      format: { type: 'string', description: 'Image format: png (default) or jpg' },
    },
  },
  execute: async (rawArgs: unknown): Promise<ToolResult> => {
    const { format } = parseCaptureArgs(rawArgs);
    const out = tempPath(format);
    // process.platform is read per call so tests can stub it.
    return runCapture('screen.capture', screenCaptureCommands(process.platform, format, out), out, format);
  },
};

/**
 * `camera.capture` — grabs one frame from a camera on the host.
 * On success `output` is a JSON string `{ mimeType, data }` where `data` is
 * the base64-encoded image. Never throws; failures are reported via `error`.
 */
export const cameraCaptureTool: Tool = {
  name: 'camera.capture',
  description: 'Capture a single frame from the default camera and return a base64-encoded image.',
  inputSchema: {
    type: 'object',
    properties: {
      format: { type: 'string', description: 'Image format: png (default) or jpg' },
      device: { type: 'string', description: 'Optional camera device identifier (platform-specific)' },
    },
  },
  execute: async (rawArgs: unknown): Promise<ToolResult> => {
    const { format, device } = parseCaptureArgs(rawArgs);
    const out = tempPath(format);
    return runCapture('camera.capture', cameraCaptureCommands(process.platform, format, out, device), out, format);
  },
};
*/ diff --git a/src/tools/index.ts b/src/tools/index.ts index ff53526..a8fbb65 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -16,3 +16,4 @@ export { fileEditTool } from './builtin/file-edit.js'; export { fileListTool } from './builtin/file-list.js'; export { systemInfoTool } from './builtin/system-info.js'; export { webFetchTool } from './builtin/web-fetch.js'; +export { screenCaptureTool, cameraCaptureTool } from './builtin/capture.js'; diff --git a/src/tools/policy.ts b/src/tools/policy.ts index a74777b..2f5f471 100644 --- a/src/tools/policy.ts +++ b/src/tools/policy.ts @@ -76,6 +76,8 @@ const PROFILE_TOOLS: Record> = { 'process.output', 'process.kill', 'process.list', + 'screen.capture', + 'camera.capture', 'browser.navigate', 'browser.screenshot', 'browser.click', @@ -91,7 +93,7 @@ const PROFILE_TOOLS: Record> = { /** Named groups for use in allow/deny lists (e.g. 'group:fs'). */ export const TOOL_GROUPS: Record = { 'group:fs': ['file.read', 'file.write', 'file.edit', 'file.patch', 'file.list'], - 'group:runtime': ['shell.exec', 'process.start', 'process.output', 'process.status', 'process.kill', 'process.list'], + 'group:runtime': ['shell.exec', 'process.start', 'process.output', 'process.status', 'process.kill', 'process.list', 'screen.capture', 'camera.capture'], 'group:web': ['web.fetch', 'web.search', 'browser.navigate', 'browser.screenshot', 'browser.click', 'browser.type', 'browser.content', 'browser.eval'], 'group:memory': ['memory.read', 'memory.write', 'memory.search'], 'group:gmail': ['gmail.list', 'gmail.search', 'gmail.read'],