feat(memory): add experimental qmd search backend

2026-02-15 19:33:43 -08:00
parent c6e3d09ecc
commit 81c97a9df1
14 changed files with 340 additions and 29 deletions
@@ -626,6 +626,10 @@ memory:
    chunk_overlap: 50                      # Overlap between chunks
    top_k: 5                               # Top results from vector search
    hybrid_weight: 0.7                     # 0.0 = keyword only, 1.0 = vector only
+  qmd:
+    enabled: false                         # Experimental markdown-native search backend
+    top_k: 8                               # Max QMD results
+    min_score: 0.15                        # Minimum match score (0-1)
 ```

 ### Embedding Providers
@@ -640,7 +644,13 @@ memory:

 Embeddings are indexed in the background — when memory is written, the namespace is marked dirty and re-indexed within 30 seconds. The vector index is stored in `vectors.db` alongside the session database.

-When embeddings are disabled or the provider is unreachable, search falls back gracefully to keyword matching.
+Search backend selection:
+
+- `memory.embedding.enabled: true` -> hybrid keyword+vector backend
+- `memory.embedding.enabled: false` and `memory.qmd.enabled: true` -> QMD markdown backend
+- otherwise -> keyword-only fallback
+
+When the selected backend is unavailable (for example, embedding provider errors), search falls back gracefully to keyword matching.

 ### Embedding Config Fields

@@ -657,6 +667,14 @@ When embeddings are disabled or the provider is unreachable, search falls back g
 | `top_k` | no | Number of vector results to return (default: `5`) |
 | `hybrid_weight` | no | Vector vs keyword weight, 0.0-1.0 (default: `0.7`) |

+### QMD Config Fields
+
+| Field | Required | Description |
+|-------|----------|-------------|
+| `enabled` | no | Enable experimental markdown-native QMD backend (default: `false`) |
+| `top_k` | no | Max QMD results returned by `memory.search` (default: `8`) |
+| `min_score` | no | Minimum relevance score (0.0-1.0) for QMD matches (default: `0.15`) |
+
 ## Gateway Lock

 Single-client mode for the WebSocket gateway. When enabled, only one WebSocket connection is allowed at a time. Additional connections are rejected with close code `4003`.
@@ -123,7 +123,7 @@ Flynn has **6 of ~15 channels** (Telegram, WhatsApp, Discord, Slack, WebChat, TU
 | `memory.write` tool | Write memory files | Full (write/append to namespace) | **MATCH** |
 | Vector embeddings | OpenAI/Gemini/local | Full (OpenAI, Gemini, Ollama, LlamaCpp providers) | **MATCH** |
 | Hybrid search (BM25 + vector) | Full | Full (keyword + vector with configurable hybrid weight) | **MATCH** |
-| QMD backend | Experimental | -- | **MISSING** |
+| QMD backend | Experimental | Full (experimental markdown-native backend configurable via `memory.qmd`) | **MATCH** |

 ---

@@ -313,7 +313,6 @@ All five Tier 3 items implemented: Lane Queue (per-session FIFO in gateway), cre
 - Elevated mode — sandbox escape hatch
 - ~~Onboard wizard — guided setup~~ (DONE — `flynn setup` + first-run auto-trigger, 2026-02-10)
 - ClawHub/skill registry — community marketplace
-- QMD backend — experimental memory search

 ---

@@ -39,9 +39,9 @@ A gap item is considered implemented when:

 - Canvas / A2UI (agent-driven visual workspace)

-### Memory (MISSING)
+### Memory

-- QMD backend (experimental)
+- QMD backend (experimental) — completed on 2026-02-16

 ### Security (MISSING)

@@ -0,0 +1,39 @@
+# QMD Backend Checklist
+
+Date: 2026-02-16
+Status: completed
+
+## Scope
+
+- Add an experimental QMD (query markdown database) backend for `memory.search`.
+- Enable config-driven backend selection between hybrid embeddings, QMD, and keyword fallback.
+- Update docs and plan state.
+
+## Completed
+
+- Added `memory.qmd` config schema in `src/config/schema.ts`:
+  - `enabled` (default `false`)
+  - `top_k` (default `8`)
+  - `min_score` (default `0.15`)
+- Implemented `QmdSearch` backend in `src/memory/qmd-search.ts`:
+  - heading-aware scoring
+  - token overlap + phrase bonus ranking
+  - session namespace recency boost
+- Wired backend selection in `src/daemon/memory.ts`:
+  - embedding enabled -> hybrid backend
+  - else if qmd enabled -> QMD backend
+  - else keyword-only search
+- Generalized memory search tool wiring:
+  - introduced shared backend interface for `memory.search`
+  - updated memory tool factory to accept any backend implementing `search(query, topK?)`
+- Updated docs:
+  - README memory section now documents QMD config and backend precedence.
+  - OpenClaw gap docs updated to mark QMD backend as implemented.
+- Added tests:
+  - `src/memory/qmd-search.test.ts`
+  - `src/config/schema.test.ts` coverage for `memory.qmd`
+
+## Verification
+
+- `pnpm test:run src/config/schema.test.ts src/memory/qmd-search.test.ts`
+- `pnpm typecheck`
@@ -223,6 +223,31 @@
      ],
      "test_status": "pnpm test:run src/channels/registry.test.ts src/gateway/handlers/handlers.test.ts + pnpm typecheck passing"
    },
+    "qmd-backend": {
+      "file": "2026-02-16-qmd-backend-checklist.md",
+      "status": "completed",
+      "date": "2026-02-16",
+      "updated": "2026-02-16",
+      "summary": "Added an experimental markdown-native QMD backend for memory.search with config-driven backend selection (hybrid embeddings -> QMD -> keyword fallback), tests, and docs updates.",
+      "files_created": [
+        "docs/plans/2026-02-16-qmd-backend-checklist.md",
+        "src/memory/qmd-search.ts",
+        "src/memory/qmd-search.test.ts"
+      ],
+      "files_modified": [
+        "src/config/schema.ts",
+        "src/config/schema.test.ts",
+        "src/daemon/memory.ts",
+        "src/memory/hybrid-search.ts",
+        "src/memory/index.ts",
+        "src/tools/builtin/index.ts",
+        "src/tools/builtin/memory-search.ts",
+        "README.md",
+        "docs/plans/2026-02-06-openclaw-feature-gap-analysis.md",
+        "docs/plans/2026-02-15-openclaw-gap-roadmap.md"
+      ],
+      "test_status": "pnpm test:run src/config/schema.test.ts src/memory/qmd-search.test.ts + pnpm typecheck passing"
+    },
    "skill-safety-scanner": {
      "file": "2026-02-15-skill-safety-scanner-checklist.md",
      "status": "completed",
@@ -2267,12 +2292,12 @@
    "tier2_completion": "4/4 (100%) — inbound webhooks, vector memory search, Dockerfile, heartbeat monitor",
    "tier3_completion": "5/5 (100%) — lane queue, credential redaction, web UI token dashboard, xAI (Grok) provider, Voyage AI embeddings",
    "tier4_completion": "4/4 (100%) — gateway lock, shell completion, Tailscale Serve/Funnel, DM pairing codes",
-    "feature_gap_scorecard": "107/128 match (84%), 0 partial (0%), 21 missing (16%)",
+    "feature_gap_scorecard": "108/128 match (84%), 0 partial (0%), 20 missing (16%)",
    "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 2/2 plans complete — milestone done",
    "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram",
    "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback",
    "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening",
-    "next_up": "Pick the next OpenClaw gap milestone and create a scoped checklist (candidates: QMD backend, ClawHub registry, Bonjour/mDNS discovery)"
+    "next_up": "Pick the next OpenClaw gap milestone and create a scoped checklist (candidates: ClawHub registry, Bonjour/mDNS discovery, synthetic provider)"
  },
  "soul_md_and_cron_create": {
    "date": "2026-02-11",
@@ -459,6 +459,9 @@ describe('configSchema — memory injection strategy', () => {
    const result = configSchema.parse(minimalConfig);
    expect(result.memory.injection_strategy).toBe('all');
    expect(result.memory.max_injection_tokens).toBe(2000);
+    expect(result.memory.qmd.enabled).toBe(false);
+    expect(result.memory.qmd.top_k).toBe(8);
+    expect(result.memory.qmd.min_score).toBe(0.15);
  });

  it('accepts adaptive memory injection settings', () => {
@@ -472,6 +475,22 @@ describe('configSchema — memory injection strategy', () => {
    expect(result.memory.injection_strategy).toBe('adaptive');
    expect(result.memory.max_injection_tokens).toBe(1200);
  });
+
+  it('accepts qmd backend settings', () => {
+    const result = configSchema.parse({
+      ...minimalConfig,
+      memory: {
+        qmd: {
+          enabled: true,
+          top_k: 12,
+          min_score: 0.2,
+        },
+      },
+    });
+    expect(result.memory.qmd.enabled).toBe(true);
+    expect(result.memory.qmd.top_k).toBe(12);
+    expect(result.memory.qmd.min_score).toBe(0.2);
+  });
 });

 describe('configSchema — compaction importance threshold', () => {
@@ -313,6 +313,15 @@ const embeddingSchema = z.object({
  hybrid_weight: z.number().min(0).max(1).default(0.7),
 }).default({});

+/**
+ * Schema for the `memory.qmd` config section (experimental QMD search backend).
+ *
+ * Every field carries its own default and the object itself defaults to `{}`,
+ * so the section can be left out of the config.
+ */
+const qmdSchema = z.object({
+  /** Enable experimental QMD (query markdown database) memory search backend. */
+  enabled: z.boolean().default(false),
+  /** Maximum number of QMD results returned by memory.search (accepted range 1-50). */
+  top_k: z.number().min(1).max(50).default(8),
+  /** Minimum relevance score (0-1) for QMD matches. */
+  min_score: z.number().min(0).max(1).default(0.15),
+}).default({});
+
 const memorySchema = z.object({
  enabled: z.boolean().default(true),
  dir: z.string().optional(), // Default: ~/.local/share/flynn/memory
@@ -321,6 +330,7 @@ const memorySchema = z.object({
  max_injection_tokens: z.number().min(100).max(10000).default(2000),
  max_context_tokens: z.number().min(100).max(10000).default(2000),
  embedding: embeddingSchema,
+  qmd: qmdSchema,
 }).default({});

 const compactionSchema = z.object({
@@ -593,6 +603,7 @@ export type HeartbeatConfig = z.infer<typeof heartbeatSchema>;
 export type HeartbeatCheck = z.infer<typeof heartbeatCheckSchema>;
 export type EmbeddingConfig = z.infer<typeof embeddingSchema>;
 export type EmbeddingProvider = z.infer<typeof embeddingProviderSchema>;
+export type QmdConfig = z.infer<typeof qmdSchema>;
 export type GcalConfig = z.infer<typeof gcalSchema>;
 export type GdocsConfig = z.infer<typeof gdocsSchema>;
 export type GdriveConfig = z.infer<typeof gdriveSchema>;
@@ -1,8 +1,9 @@
 import type { Config } from '../config/index.js';
 import type { Lifecycle } from './lifecycle.js';
 import { MemoryStore } from '../memory/index.js';
-import { VectorStore, HybridSearch, createEmbeddingProvider, chunkText, contentHash } from '../memory/index.js';
+import { VectorStore, HybridSearch, QmdSearch, createEmbeddingProvider, chunkText, contentHash } from '../memory/index.js';
 import type { EmbeddingProvider as EmbeddingProviderInterface } from '../memory/index.js';
+import type { MemorySearchBackend } from '../tools/builtin/memory-search.js';
 import { createMemoryTools } from '../tools/builtin/index.js';
 import type { ToolRegistry } from '../tools/index.js';
 import { resolve } from 'path';
@@ -17,7 +18,7 @@ export interface MemoryDeps {

 export interface MemoryResult {
  memoryStore?: MemoryStore;
-  hybridSearch?: HybridSearch;
+  searchBackend?: MemorySearchBackend;
  memoryDir: string;
 }

@@ -32,18 +33,19 @@ export async function initMemory(deps: MemoryDeps): Promise<MemoryResult> {
    : undefined;

  // Register memory tools if memory is enabled
-  let hybridSearch: HybridSearch | undefined;
+  let searchBackend: MemorySearchBackend | undefined;

  if (memoryStore && config.memory.embedding.enabled) {
    try {
      const embeddingProvider: EmbeddingProviderInterface = createEmbeddingProvider(config.memory.embedding);
      const vectorStore = new VectorStore(resolve(dataDir, 'vectors.db'));
-      hybridSearch = new HybridSearch(
+      const hybridSearch = new HybridSearch(
        memoryStore,
        vectorStore,
        embeddingProvider,
        config.memory.embedding.hybrid_weight,
      );
+      searchBackend = hybridSearch;

      // Background indexer: re-embed dirty namespaces every 30 seconds
      const indexerInterval = setInterval(async () => {
@@ -89,11 +91,19 @@ export async function initMemory(deps: MemoryDeps): Promise<MemoryResult> {
    }
  }

+  if (!searchBackend && memoryStore && config.memory.qmd.enabled) {
+    searchBackend = new QmdSearch(memoryStore, {
+      topK: config.memory.qmd.top_k,
+      minScore: config.memory.qmd.min_score,
+    });
+    console.log(`QMD memory search enabled (top_k=${config.memory.qmd.top_k}, min_score=${config.memory.qmd.min_score})`);
+  }
+
  if (memoryStore) {
-    for (const tool of createMemoryTools(memoryStore, hybridSearch)) {
+    for (const tool of createMemoryTools(memoryStore, searchBackend)) {
      toolRegistry.register(tool);
    }
  }

-  return { memoryStore, hybridSearch, memoryDir };
+  return { memoryStore, searchBackend, memoryDir };
 }
@@ -20,8 +20,8 @@ export interface HybridSearchResult {
  line: number;
  /** Combined relevance score (0-1). */
  score: number;
-  /** Source of the match: keyword, vector, or both. */
-  source: 'keyword' | 'vector' | 'both';
+  /** Source of the match: keyword, vector, qmd, or both. */
+  source: 'keyword' | 'vector' | 'qmd' | 'both';
 }

 /**
@@ -8,6 +8,8 @@ export { VectorStore, cosineSimilarity, contentHash } from './vector-store.js';
 export type { VectorSearchResult, EmbeddingRow } from './vector-store.js';
 export { HybridSearch } from './hybrid-search.js';
 export type { HybridSearchResult } from './hybrid-search.js';
+export { QmdSearch } from './qmd-search.js';
+export type { QmdSearchOptions } from './qmd-search.js';
 export * from './categories.js';
 export { buildAdaptiveMemoryContext, buildRecentMemoryContext } from './adaptive.js';
 export type { AdaptiveMemoryConfig } from './adaptive.js';
@@ -0,0 +1,51 @@
+import { describe, expect, it } from 'vitest';
+import { mkdtempSync, rmSync } from 'fs';
+import { join } from 'path';
+import { tmpdir } from 'os';
+import { MemoryStore } from './store.js';
+import { QmdSearch } from './qmd-search.js';
+
+describe('QmdSearch', () => {
+  /**
+   * Hands `body` a MemoryStore backed by a fresh temporary directory and
+   * removes that directory afterwards, whether or not `body` throws.
+   */
+  async function withTempStore(body: (store: MemoryStore) => Promise<void>): Promise<void> {
+    const dir = mkdtempSync(join(tmpdir(), 'flynn-qmd-search-'));
+    try {
+      await body(new MemoryStore({ dir, maxContextTokens: 2000 }));
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  }
+
+  it('finds relevant markdown lines with heading-aware scoring', async () => {
+    await withTempStore(async (store) => {
+      const userMemory = [
+        '# Preferences',
+        '- Favorite editor is Neovim',
+        '- Uses TypeScript daily',
+        '',
+        '# Projects',
+        '- QMD backend prototype for memory search',
+      ].join('\n');
+      store.write('user', userMemory, 'replace');
+      store.write('sessions/abc123', '- Discussed QMD ranking for markdown memory.', 'replace');
+
+      const backend = new QmdSearch(store, { topK: 5, minScore: 0.1 });
+      const hits = await backend.search('qmd memory search');
+
+      // At least one hit, labelled as coming from the QMD backend.
+      expect(hits.length).toBeGreaterThan(0);
+      expect(hits[0].source).toBe('qmd');
+
+      // Both the long-lived and the session namespace must be represented.
+      const namespaces = hits.map((hit) => hit.namespace);
+      expect(namespaces).toContain('user');
+      expect(namespaces).toContain('sessions/abc123');
+    });
+  });
+
+  it('returns empty results for empty query', async () => {
+    await withTempStore(async (store) => {
+      store.write('user', 'hello world', 'replace');
+      const hits = await new QmdSearch(store).search('   ');
+      expect(hits).toEqual([]);
+    });
+  });
+});
@@ -0,0 +1,132 @@
+import type { MemoryStore } from './store.js';
+import type { HybridSearchResult } from './hybrid-search.js';
+
+/** Optional tuning knobs accepted by the `QmdSearch` constructor. */
+export interface QmdSearchOptions {
+  /** Default cap on results per search; `QmdSearch` uses 8 when omitted. */
+  topK?: number;
+  /** Lines scoring below this value are dropped; `QmdSearch` uses 0.15 when omitted. */
+  minScore?: number;
+}
+
+/**
+ * Experimental QMD (query markdown database) search backend.
+ *
+ * QMD treats markdown memory as structured text:
+ * - heading lines are never returned as matches; the most recent heading
+ *   gives the lines below it a topical boost
+ * - line-level query token overlap is scored
+ * - exact phrase match receives an additional boost
+ */
+export class QmdSearch {
+  private readonly _store: MemoryStore;
+  private readonly _topK: number;
+  private readonly _minScore: number;
+
+  /**
+   * @param store - Memory store whose namespaces are scanned on every search.
+   * @param options - Optional tuning; `topK` defaults to 8 and `minScore` to 0.15.
+   */
+  constructor(store: MemoryStore, options?: QmdSearchOptions) {
+    this._store = store;
+    this._topK = options?.topK ?? 8;
+    this._minScore = options?.minScore ?? 0.15;
+  }
+
+  /**
+   * Scores every non-blank, non-heading line of every namespace against the query.
+   *
+   * @param query - Free-text query; trimmed and lowercased before matching.
+   * @param topK - Per-call result cap; falls back to the constructor's `topK`.
+   *   Negative values are treated as 0.
+   * @returns Matches scoring at least `minScore`, highest score first, each with
+   *   up to one neighbouring non-blank line of context on either side. Empty
+   *   when the query is blank or yields no tokens.
+   */
+  async search(query: string, topK?: number): Promise<HybridSearchResult[]> {
+    const queryText = query.trim().toLowerCase();
+    if (queryText.length === 0) {
+      return [];
+    }
+
+    const queryTokens = tokenize(queryText);
+    if (queryTokens.length === 0) {
+      return [];
+    }
+
+    // slice(0, negative) counts from the end of the array, which would return
+    // "all but the last n" results; clamp so a negative cap yields none.
+    const limit = Math.max(0, topK ?? this._topK);
+
+    // Markdown ATX heading: one to six '#' followed by whitespace and text.
+    const headingPattern = /^#{1,6}\s+(.+)$/;
+
+    const results: HybridSearchResult[] = [];
+    for (const namespace of this._store.listNamespaces()) {
+      const content = this._store.read(namespace);
+      if (content.length === 0) {
+        continue;
+      }
+
+      // Split on LF or CRLF so a trailing "\r" never leaks into result text.
+      const lines = content.split(/\r?\n/);
+      let currentHeading = '';
+
+      for (let i = 0; i < lines.length; i++) {
+        const raw = lines[i];
+        const line = raw.trim();
+        if (line.length === 0) {
+          continue;
+        }
+
+        // Headings only update the scoring context; they are not candidates.
+        const heading = line.match(headingPattern);
+        if (heading) {
+          currentHeading = heading[1].toLowerCase();
+          continue;
+        }
+
+        const score = scoreLine(line, queryText, queryTokens, currentHeading, namespace);
+        if (score < this._minScore) {
+          continue;
+        }
+
+        // Context is the match plus its immediate non-blank neighbours.
+        const contextParts: string[] = [];
+        if (i > 0 && lines[i - 1].trim().length > 0) {
+          contextParts.push(lines[i - 1]);
+        }
+        contextParts.push(raw);
+        if (i < lines.length - 1 && lines[i + 1].trim().length > 0) {
+          contextParts.push(lines[i + 1]);
+        }
+
+        results.push({
+          namespace,
+          content: raw,
+          context: contextParts.join('\n'),
+          line: i + 1, // 1-based line number within the namespace
+          score,
+          source: 'qmd',
+        });
+      }
+    }
+
+    results.sort((a, b) => b.score - a.score);
+    return results.slice(0, limit);
+  }
+}
+
+/**
+ * Splits text into lowercase search tokens.
+ *
+ * Any run of characters that is not a Unicode letter, combining mark, or
+ * number acts as a separator, so non-ASCII words (for example "café" or
+ * "поиск") survive as whole tokens instead of being truncated or dropped.
+ * Tokens shorter than two UTF-16 code units are discarded.
+ *
+ * @param text - Arbitrary text; may be empty.
+ * @returns Tokens in order of appearance, duplicates preserved.
+ */
+function tokenize(text: string): string[] {
+  return text
+    .toLowerCase()
+    .split(/[^\p{L}\p{M}\p{N}]+/u)
+    .filter((token) => token.length >= 2);
+}
+
+/**
+ * Computes a relevance score, capped at 1, for a single memory line.
+ *
+ * The score is the sum of:
+ * - the fraction of query tokens present in the line,
+ * - 0.25 when the full query text occurs verbatim in the line,
+ * - 0.08 per query token present in the current heading (at most 0.25),
+ * - 0.05 when the namespace starts with `sessions/`.
+ *
+ * @param line - Candidate line.
+ * @param queryText - Query as a single string; compared against the lowercased line.
+ * @param queryTokens - Query tokens; expected to be non-empty.
+ * @param currentHeading - Text of the heading the line sits under, or '' if none.
+ * @param namespace - Namespace the line belongs to.
+ */
+function scoreLine(
+  line: string,
+  queryText: string,
+  queryTokens: string[],
+  currentHeading: string,
+  namespace: string,
+): number {
+  const lowered = line.toLowerCase();
+  const wordsInLine = new Set(tokenize(lowered));
+  const wordsInHeading = new Set(tokenize(currentHeading));
+
+  // Share of query tokens that also appear in the line.
+  const matched = queryTokens.filter((token) => wordsInLine.has(token)).length;
+  const overlapRatio = matched / queryTokens.length;
+
+  const exactPhrase = lowered.includes(queryText) ? 0.25 : 0;
+
+  // Accumulate 0.08 per heading hit, then cap the total.
+  const headingTotal = queryTokens.reduce(
+    (sum, token) => (wordsInHeading.has(token) ? sum + 0.08 : sum),
+    0,
+  );
+  const headingBoost = Math.min(0.25, headingTotal);
+
+  // Session-scoped memories often represent recent conversational facts.
+  const sessionBoost = namespace.startsWith('sessions/') ? 0.05 : 0;
+
+  return Math.min(1, overlapRatio + exactPhrase + headingBoost + sessionBoost);
+}
@@ -30,8 +30,8 @@ export { createGtasksTools } from './gtasks.js';

 import type { Tool } from '../types.js';
 import type { MemoryStore } from '../../memory/store.js';
-import type { HybridSearch } from '../../memory/hybrid-search.js';
 import type { WebSearchConfig } from './web-search.js';
+import type { MemorySearchBackend } from './memory-search.js';
 import { shellExecTool } from './shell.js';
 import { fileReadTool } from './file-read.js';
 import { fileWriteTool } from './file-write.js';
@@ -60,11 +60,11 @@ export const allBuiltinTools: Tool[] = [
 ];

 /** Create memory tools that require a MemoryStore instance. */
-export function createMemoryTools(store: MemoryStore, hybridSearch?: HybridSearch): Tool[] {
+export function createMemoryTools(store: MemoryStore, searchBackend?: MemorySearchBackend): Tool[] {
  return [
    createMemoryReadTool(store),
    createMemoryWriteTool(store),
-    createMemorySearchTool(store, hybridSearch),
+    createMemorySearchTool(store, searchBackend),
  ];
 }

@@ -1,22 +1,26 @@
 import type { Tool, ToolResult } from '../types.js';
 import type { MemoryStore } from '../../memory/store.js';
-import type { HybridSearch } from '../../memory/hybrid-search.js';
+import type { HybridSearchResult } from '../../memory/hybrid-search.js';

 interface MemorySearchArgs {
  query: string;
 }

+/**
+ * Minimal contract a search implementation must satisfy to back `memory.search`.
+ *
+ * The tool calls `search(query)` and formats each returned result's namespace,
+ * line, source, score, content and context.
+ */
+export interface MemorySearchBackend {
+  /**
+   * @param query - Query text as supplied to the tool.
+   * @param topK - Optional cap on the number of results; how it is applied is
+   *   up to the implementation.
+   * @returns Matches in the shared `HybridSearchResult` shape.
+   */
+  search(query: string, topK?: number): Promise<HybridSearchResult[]>;
+}
+
 /**
 * Creates a memory.search tool bound to the given MemoryStore instance.
- * When a HybridSearch instance is provided, uses vector + keyword search;
+ * When a search backend is provided, uses backend-assisted search;
 * otherwise falls back to keyword-only search.
 */
-export function createMemorySearchTool(store: MemoryStore, hybridSearch?: HybridSearch): Tool {
+export function createMemorySearchTool(store: MemoryStore, searchBackend?: MemorySearchBackend): Tool {
  return {
    name: 'memory.search',
    description:
      'Search across all memory files for a keyword or phrase. Returns matching lines with surrounding context from every namespace.' +
-      (hybridSearch ? ' Uses semantic vector search combined with keyword matching for better results.' : '') +
+      (searchBackend ? ' Uses an enhanced search backend (hybrid vector/keyword or QMD) when configured.' : '') +
      ' Category namespaces (facts/preferences/decisions/projects) are searchable through the namespace path.',
    inputSchema: {
      type: 'object',
@@ -32,10 +36,10 @@ export function createMemorySearchTool(store: MemoryStore, hybridSearch?: Hybrid
      const args = rawArgs as MemorySearchArgs;

      try {
-        // Try hybrid search first if available
-        if (hybridSearch) {
+        // Try enhanced search backend first if available
+        if (searchBackend) {
          try {
-            const results = await hybridSearch.search(args.query);
+            const results = await searchBackend.search(args.query);

            if (results.length === 0) {
              return { success: true, output: `No matches found for "${args.query}".` };
@@ -44,6 +48,7 @@ export function createMemorySearchTool(store: MemoryStore, hybridSearch?: Hybrid
            const formatted = results.map((result) => {
              const sourceLabel = result.source === 'both' ? 'keyword+vector'
                : result.source === 'vector' ? 'vector'
+                  : result.source === 'qmd' ? 'qmd'
                    : 'keyword';
              return `[${result.namespace}:${result.line}] (${sourceLabel}, score: ${result.score.toFixed(3)}) ${result.content}\n  context: ${result.context}`;
            }).join('\n\n');
@@ -52,9 +57,9 @@ export function createMemorySearchTool(store: MemoryStore, hybridSearch?: Hybrid
              success: true,
              output: `Found ${results.length} match${results.length === 1 ? '' : 'es'} for "${args.query}":\n\n${formatted}`,
            };
-          } catch (hybridError) {
-            // Fall back to keyword search on hybrid failure
-            console.error('Hybrid search failed, falling back to keyword search:', hybridError);
+          } catch (backendError) {
+            // Fall back to keyword search on backend failure
+            console.error('Enhanced memory search backend failed, falling back to keyword search:', backendError);
          }
        }