From e9873ad22b6e29f3ed385dc6c5c125e000c658d0 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Thu, 26 Feb 2026 14:06:53 -0800 Subject: [PATCH] Document browser reliability layer and roadmap progress --- README.md | 13 +++++++ docs/api/PROTOCOL.md | 7 ++++ docs/api/TOOLS.md | 4 +-- docs/architecture/AGENT_DIAGRAM.md | 1 + .../GATEWAY_SESSIONS_AND_QUEUE.md | 1 + docs/plans/state.json | 34 +++++++++++++++---- 6 files changed, 52 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 254ce2e..91bbfdc 100644 --- a/README.md +++ b/README.md @@ -951,12 +951,18 @@ Flynn ships these browser tools: - `browser.click` - `browser.type` - `browser.content` +- `browser.wait_for` +- `browser.assert` +- `browser.extract` +- `browser.checkpoint.save` +- `browser.checkpoint.resume` - `browser.eval` - `browser.evaluate` (alias of `browser.eval`) These tools are backed by a Puppeteer/CDP browser manager and are only registered when `browser.enabled: true`. They can still be filtered out by tool policy (`tools.profile`, `tools.allow`, `tools.deny`). At startup, Flynn logs the browser tools that remain available after policy filtering. +Browser runtime guardrails support domain allowlists, explicit high-risk-domain confirmation, retry controls, and a bounded workflow step budget. ```yaml browser: @@ -964,6 +970,13 @@ browser: headless: true max_pages: 5 default_timeout: 30000 + allowed_domains: ["*.example.com"] + high_risk_domains: ["bank.example.com"] + require_confirmation_for_high_risk: true + max_workflow_steps: 120 + default_retry_attempts: 1 + max_retry_attempts: 5 + retry_delay_ms: 250 # executable_path: /usr/bin/google-chrome # ws_endpoint: ws://127.0.0.1:9222/devtools/browser/ diff --git a/docs/api/PROTOCOL.md b/docs/api/PROTOCOL.md index b01cf20..0329397 100644 --- a/docs/api/PROTOCOL.md +++ b/docs/api/PROTOCOL.md @@ -1302,6 +1302,10 @@ Set callback for tool use events (for confirmation UI). List available tools. +When browser automation is enabled, `tools.list` may include workflow-reliability helpers such as: +`browser.wait_for`, `browser.assert`, `browser.extract`, `browser.checkpoint.save`, and `browser.checkpoint.resume` +in addition to baseline navigation/click/type/content/eval tools. + **Request:** ```json { @@ -1338,6 +1342,9 @@ List available tools. Execute a tool directly (bypass agent). +Browser workflow tools enforce runtime guardrails configured in `browser.*`: +domain allowlists, high-risk-domain confirmation (`confirm_high_risk=true`), retry bounds, and step-budget limits. + **Request:** ```json { diff --git a/docs/api/TOOLS.md b/docs/api/TOOLS.md index 11b5da2..7de429d 100644 --- a/docs/api/TOOLS.md +++ b/docs/api/TOOLS.md @@ -25,7 +25,7 @@ Tools are executable capabilities that the AI agent can call to perform actions - **File System**: `file.read`, `file.write`, `file.edit`, `file.list` - **Shell/Process**: `shell.exec`, `process.start`, `process.kill` - **Web**: `web.fetch`, `web.search` -- **Browser**: `browser.navigate`, `browser.screenshot`, `browser.click`, `browser.type`, `browser.content`, `browser.eval`, `browser.evaluate` (alias of `browser.eval`) +- **Browser**: `browser.navigate`, `browser.screenshot`, `browser.click`, `browser.type`, `browser.content`, `browser.wait_for`, `browser.assert`, `browser.extract`, `browser.checkpoint.save`, `browser.checkpoint.resume`, `browser.eval`, `browser.evaluate` (alias of `browser.eval`) - **Memory**: `memory.read`, `memory.write`, `memory.search` - **MinIO**: `minio.share`, `minio.ingest`, `minio.sync` - **Kubernetes**: `k8s.pods`, `k8s.deployments`, `k8s.logs` @@ -330,7 +330,7 @@ Use for tools that share a common dependency or manager. import type { Tool, ToolResult } from '../../types.js'; import type { BrowserManager } from './manager.js'; -export function createBrowserTools(manager: BrowserManager): Tool[] { +export function createBrowserTools(manager: BrowserManager, options?: BrowserToolsOptions): Tool[] { return [ { name: 'browser.navigate', diff --git a/docs/architecture/AGENT_DIAGRAM.md b/docs/architecture/AGENT_DIAGRAM.md index 6f241bd..51b5bfc 100644 --- a/docs/architecture/AGENT_DIAGRAM.md +++ b/docs/architecture/AGENT_DIAGRAM.md @@ -266,6 +266,7 @@ Flynn treats content provenance as part of the control boundary: - `web.fetch`, `web.search`, and `browser.content` outputs are treated as untrusted "fetched_content". - Tool results are wrapped in provenance markers inside the tool loop. - Once untrusted content is seen, ToolExecutor applies stricter gating (blocks obvious injection patterns for high-risk tools). +- Browser workflow tools add execution guardrails in the tool layer: `allowed_domains`, explicit high-risk confirmations, bounded retry policies, and step-budget enforcement. Key files: diff --git a/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md b/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md index ce9d1be..d8662ae 100644 --- a/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md +++ b/docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md @@ -18,6 +18,7 @@ If you only want the protocol surface, see `docs/api/PROTOCOL.md`. - Run lifecycle/cancel intent and reaction decisions are emitted to audit logs, and aggregated into `system.metrics` counters (runStates, cancelLatencyMs, reactions) for dashboards. - Reaction matching is deterministic (priority + cooldown + recursion guard) before intent/agent routing. - `subagent.*` tools create child orchestrators scoped to the parent conversation (`subagent::`) with idle TTL cleanup, per-child queue mode (`followup|interrupt`), and session budgets (turn/token/timeout); this is tool-loop behavior, not a separate gateway RPC session lane. +- Browser workflow reliability primitives (`browser.wait_for/assert/extract/checkpoint.*`) execute in the same queued session lane and apply browser-config guardrails (domain allowlist/high-risk confirmation, bounded retries, workflow step budget). - Companion `node.*` registration is per WebSocket connection; reconnects must re-register capabilities before invoking node RPC methods. - Canvas artifacts are persisted per session under the gateway data directory for UI recovery across restarts. - TTS output is best-effort; synthesis failures fall back to text-only responses. diff --git a/docs/plans/state.json b/docs/plans/state.json index 9217b27..657ef10 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -6786,15 +6786,37 @@ "test_status": "docs only" }, "personal-assistant-productization-plan-2026-02-26": { - "status": "proposed", + "status": "in_progress", "date": "2026-02-26", "updated": "2026-02-26", - "summary": "Rebaselined Flynn's OpenClaw-style personal-assistant gaps and defined an execution-ready 8-10 week productization roadmap focused on shipped companion apps, voice daily-driver reliability, browser workflow reliability, and onboarding first-success funnel metrics.", + "summary": "Rebaselined Flynn's OpenClaw-style personal-assistant gaps and defined an execution-ready 8-10 week roadmap. Phase 3 browser reliability work is now shipped (workflow primitives, retry/budget/guardrails, checkpoints), with companion/voice/onboarding phases remaining.", "files_modified": [ "docs/plans/2026-02-26-personal-assistant-productization-plan.md", "docs/plans/state.json" ], - "test_status": "planning/docs update only; no runtime code changes" + "test_status": "roadmap status updated; implementation tracked in phase-specific entries" + }, + "personal-assistant-productization-phase3-browser-reliability": { + "status": "completed", + "date": "2026-02-26", + "updated": "2026-02-26", + "summary": "Implemented Phase 3 browser workflow reliability layer: added `browser.wait_for`, `browser.assert`, `browser.extract`, checkpoint save/resume tools, retry wrappers, domain allowlist + high-risk confirmation guardrails, and bounded workflow-step budgets wired through config and daemon registration.", + "files_modified": [ + "src/tools/builtin/browser/tools.ts", + "src/tools/builtin/browser/tools.test.ts", + "src/daemon/tools.ts", + "src/tools/policy.ts", + "src/config/schema.ts", + "src/config/schema.test.ts", + "config/default.yaml", + "README.md", + "docs/api/TOOLS.md", + "docs/api/PROTOCOL.md", + "docs/architecture/AGENT_DIAGRAM.md", + "docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md", + "docs/plans/state.json" + ], + "test_status": "pnpm test:run src/tools/builtin/browser/tools.test.ts src/config/schema.test.ts src/tools/policy.test.ts + pnpm typecheck passing" }, "subagents-support-phase1": { "status": "completed", @@ -6830,7 +6852,7 @@ } }, "overall_progress": { - "total_test_count": 2534, + "total_test_count": 2544, "all_tests_passing": true, "p0_completion": "3/3 (100%)", "p1_completion": "4/4 (100%)", @@ -6845,7 +6867,7 @@ "tier2_completion": "4/4 (100%) \u2014 inbound webhooks, vector memory search, Dockerfile, heartbeat monitor", "tier3_completion": "5/5 (100%) \u2014 lane queue, credential redaction, web UI token dashboard, xAI (Grok) provider, Voyage AI embeddings", "tier4_completion": "4/4 (100%) \u2014 gateway lock, shell completion, Tailscale Serve/Funnel, DM pairing codes", - "feature_gap_scorecard": "rebaselined 2026-02-26 — channel breadth, setup wizard, baseline browser automation, and full subagent support (`subagent.*` + queue modes + budgets + trace/audit + `/subagents` inspection) are implemented; remaining high-impact personal-assistant gaps center on shipped companion apps (desktop/mobile), voice UX polish, browser workflow reliability primitives, and first-success onboarding funnel optimization.", + "feature_gap_scorecard": "rebaselined 2026-02-26 and updated 2026-02-26 (phase 3) — channel breadth, setup wizard, baseline browser automation, subagent controls, and browser workflow reliability primitives (wait/assert/extract/retries/checkpoints/guardrails/budgets) are implemented; remaining high-impact personal-assistant gaps center on shipped companion apps (desktop/mobile), voice UX polish, and first-success onboarding funnel optimization.", "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 2/2 plans complete \u2014 milestone done", "dashboard_observability": "completed \u2014 service health graphs + core service log viewer added to web UI via observability RPCs and bounded backend sampling", "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram", @@ -6878,7 +6900,7 @@ "deeper_surfaces_phase3_companion_canvas_voice": "completed \u2014 companion reconnect resilience (auto-reconnect with backoff, pending-wait cancellation on disconnect), canvas artifact persistence (SQLite-backed store, daemon-restart durability), voice TTS fallback coverage (text-only reply on TTS failure, no dropped responses)", "deeper_surfaces_phase4_rollout": "completed \u2014 phase 4 rollout and operator readiness plan documented: canary rollout plan by feature flag/surface, explicit rollback playbook, operator docs and architecture/protocol docs synchronized", "post_phase_test_fixes": "completed \u2014 fixed 4 test failures introduced by phases 1-3: iOS/Android push listNodes (missing publishHeartbeat before platform-filtered query), server.test agent.send (run_state events now precede done; added sendAndWaitForDone helper), httpBody 413 (req.destroy() closed socket before response could be sent; replaced with Connection: close header on 413 responses)", - "personal_assistant_productization_plan": "proposed \u2014 8-10 week phased roadmap defined (companion MVP surfaces, voice reliability hardening, browser workflow reliability layer, onboarding 2.0 first-success funnel) with measurable exit gates.", + "personal_assistant_productization_plan": "in_progress \u2014 8-10 week phased roadmap active; Phase 3 browser workflow reliability layer shipped (wait/assert/extract/checkpoints + guardrails/retries/budgets). Remaining phases: companion MVP surfaces, voice reliability hardening, and onboarding 2.0 first-success funnel.", "subagents_support": "completed \u2014 subagent phases 1-3 shipped with `subagent.spawn/send/list/cancel/delete/summary`, per-child queue mode (`followup|interrupt`), budgets (`max_turns`, `max_total_tokens`, `turn_timeout_ms`), tool-profile overrides, trace-linked audit events, `/subagents` inspection commands, and focused regression tests." }, "soul_md_and_cron_create": {