From 0548ab3833b81153434808c29b3994e80bfc46bb Mon Sep 17 00:00:00 2001 From: William Valentin Date: Mon, 16 Feb 2026 14:38:01 -0800 Subject: [PATCH] feat(tools): add docx extraction for minio ingestion --- README.md | 4 +-- docs/api/TOOLS.md | 4 +-- docs/plans/state.json | 15 ++++++++++++ src/tools/builtin/minio-ingest.test.ts | 34 +++++++++++++++++++++++++- src/tools/builtin/minio-ingest.ts | 24 ++++++++++++++++-- src/tools/builtin/minio-sync.test.ts | 29 +++++++++++++++++++++- 6 files changed, 102 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 99de387..a722163 100644 --- a/README.md +++ b/README.md @@ -627,8 +627,8 @@ backup: When `backup.minio.enabled` is configured, Flynn also exposes MinIO tools: - `minio.share`: upload a local file to the configured MinIO bucket and return a temporary download URL (`mc share download`) -- `minio.ingest`: read a text-like object (and PDF via `pdftotext` when available) from MinIO and append/replace a memory namespace -- `minio.sync`: recursively ingest a MinIO prefix into nested memory namespaces with object and size limits (including PDF extraction when available) +- `minio.ingest`: read a text-like object (plus PDF/DOCX via extractor tools when available) from MinIO and append/replace a memory namespace +- `minio.sync`: recursively ingest a MinIO prefix into nested memory namespaces with object and size limits (including PDF/DOCX extraction when available) ## Kubernetes Tools diff --git a/docs/api/TOOLS.md b/docs/api/TOOLS.md index ba639b6..771581c 100644 --- a/docs/api/TOOLS.md +++ b/docs/api/TOOLS.md @@ -968,7 +968,7 @@ Upload a local file to MinIO and return a temporary presigned download URL. #### `minio.ingest` -Read a text-like object from MinIO (and PDFs when `pdftotext` is available) and write it into a memory namespace. +Read a text-like object from MinIO (and PDF/DOCX via local extraction tools when available) and write it into a memory namespace. 
```json { @@ -1010,7 +1010,7 @@ Read a text-like object from MinIO (and PDFs when `pdftotext` is available) and #### `minio.sync` -Sync text-like objects from a MinIO prefix into nested memory namespaces (with PDF extraction when available). +Sync text-like objects from a MinIO prefix into nested memory namespaces (with PDF/DOCX extraction when available). ```json { diff --git a/docs/plans/state.json b/docs/plans/state.json index c98647b..a606647 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -157,6 +157,21 @@ ], "test_status": "pnpm test:run src/tools/builtin/minio-ingest.test.ts src/tools/builtin/minio-sync.test.ts + pnpm typecheck passing" }, + "minio-docx-ingestion-support": { + "status": "completed", + "date": "2026-02-16", + "updated": "2026-02-16", + "summary": "Extended MinIO knowledge ingestion/sync to support DOCX extraction with fallback chain (`pandoc` then `docx2txt`) in both `minio.ingest` and `minio.sync` paths, plus tests/docs updates.", + "files_modified": [ + "src/tools/builtin/minio-ingest.ts", + "src/tools/builtin/minio-ingest.test.ts", + "src/tools/builtin/minio-sync.test.ts", + "README.md", + "docs/api/TOOLS.md", + "docs/plans/state.json" + ], + "test_status": "pnpm test:run src/tools/builtin/minio-ingest.test.ts src/tools/builtin/minio-sync.test.ts + pnpm typecheck passing" + }, "backup-session-summary-audit-trail": { "status": "completed", "date": "2026-02-16", diff --git a/src/tools/builtin/minio-ingest.test.ts b/src/tools/builtin/minio-ingest.test.ts index 604bce4..b65729d 100644 --- a/src/tools/builtin/minio-ingest.test.ts +++ b/src/tools/builtin/minio-ingest.test.ts @@ -32,6 +32,7 @@ describe('minio ingest internals', () => { expect(minioIngestInternals.isLikelyTextObject('notes/today.md')).toBe(true); expect(minioIngestInternals.isLikelyTextObject('logs/daemon.log')).toBe(true); expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(true); + 
expect(minioIngestInternals.isLikelyTextObject('manual.docx')).toBe(true); }); it('rejects likely binary extensions', () => { @@ -79,7 +80,7 @@ describe('createMinioIngestTool', () => { const execRunner = vi.fn(); const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner }); - const result = await tool.execute({ object_key: 'knowledge/diagram.docx' }); + const result = await tool.execute({ object_key: 'knowledge/diagram.exe' }); expect(result.success).toBe(false); expect(result.error).toContain('Unsupported object type'); expect(execRunner).not.toHaveBeenCalled(); @@ -121,6 +122,37 @@ describe('createMinioIngestTool', () => { ); }); + it('extracts DOCX text with pandoc', async () => { + const write = vi.fn(); + const store = { write } as unknown as MemoryStore; + const execRunner = vi.fn(async (_file: string, args: string[]) => { + if (_file === 'mc' && args[0] === 'cp') { + return { stdout: '', stderr: '' }; + } + if (_file === 'pandoc') { + return { stdout: 'Extracted DOCX text', stderr: '' }; + } + return { stdout: '', stderr: '' }; + }); + const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner }); + + const result = await tool.execute({ + object_key: 'knowledge/spec.docx', + mode: 'replace', + }); + expect(result.success).toBe(true); + expect(write).toHaveBeenCalledWith( + 'global/knowledge', + expect.stringContaining('Extracted DOCX text'), + 'replace', + ); + expect(execRunner).toHaveBeenCalledWith( + 'pandoc', + expect.arrayContaining(['-t', 'plain']), + expect.any(Object), + ); + }); + it('allows non-text extension when force=true', async () => { const write = vi.fn(); const store = { write } as unknown as MemoryStore; diff --git a/src/tools/builtin/minio-ingest.ts b/src/tools/builtin/minio-ingest.ts index 0a03013..fefc4c6 100644 --- a/src/tools/builtin/minio-ingest.ts +++ b/src/tools/builtin/minio-ingest.ts @@ -34,6 +34,7 @@ const TEXT_EXTENSIONS = new Set([ ]); const EXTRACTABLE_BINARY_EXTENSIONS = new Set([ '.pdf', + 
'.docx', ]); export interface MinioIngestDeps { @@ -61,6 +62,7 @@ async function readObjectText( objectKey: string, env: NodeJS.ProcessEnv, ): Promise<string> { + const ext = extname(objectKey).toLowerCase(); if (!isExtractableBinaryObject(objectKey)) { const { stdout } = await runner('mc', ['cat', remotePath], { env, maxBuffer: 20 * 1024 * 1024 }); return toText(stdout); @@ -70,8 +72,26 @@ async function readObjectText( const localPath = join(tempDir, 'object.bin'); try { await runner('mc', ['cp', remotePath, localPath], { env, maxBuffer: 20 * 1024 * 1024 }); - const { stdout } = await runner('pdftotext', ['-q', localPath, '-'], { maxBuffer: 20 * 1024 * 1024 }); - return toText(stdout); + if (ext === '.pdf') { + const { stdout } = await runner('pdftotext', ['-q', localPath, '-'], { maxBuffer: 20 * 1024 * 1024 }); + return toText(stdout); + } + if (ext === '.docx') { + try { + const { stdout } = await runner('pandoc', [localPath, '-t', 'plain'], { maxBuffer: 20 * 1024 * 1024 }); + return toText(stdout); + } catch (pandocError) { + try { + const { stdout } = await runner('docx2txt', [localPath, '-'], { maxBuffer: 20 * 1024 * 1024 }); + return toText(stdout); + } catch (docx2txtError) { + const pErr = pandocError instanceof Error ? pandocError.message : String(pandocError); + const dErr = docx2txtError instanceof Error ? 
docx2txtError.message : String(docx2txtError); + throw new Error(`Failed to extract DOCX text (pandoc/docx2txt unavailable or failed): ${pErr}; ${dErr}`); + } + } + } + throw new Error(`Unsupported extractable object type: ${objectKey}`); } finally { rmSync(tempDir, { recursive: true, force: true }); } diff --git a/src/tools/builtin/minio-sync.test.ts b/src/tools/builtin/minio-sync.test.ts index c8fca2b..3a26992 100644 --- a/src/tools/builtin/minio-sync.test.ts +++ b/src/tools/builtin/minio-sync.test.ts @@ -104,7 +104,7 @@ describe('createMinioSyncTool', () => { const store = { write } as unknown as MemoryStore; const execRunner = vi.fn(async (_file: string, args: string[]) => { if (args[0] === 'ls') { - return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.docx"}', stderr: '' }; + return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.exe"}', stderr: '' }; } return { stdout: 'fake text', stderr: '' }; }); @@ -117,6 +117,33 @@ describe('createMinioSyncTool', () => { expect(write).not.toHaveBeenCalled(); }); + it('syncs DOCX objects via text extraction', async () => { + const write = vi.fn(); + const store = { write } as unknown as MemoryStore; + const execRunner = vi.fn(async (_file: string, args: string[]) => { + if (_file === 'mc' && args[0] === 'ls') { + return { stdout: '{"status":"success","type":"file","key":"knowledge/spec.docx"}', stderr: '' }; + } + if (_file === 'mc' && args[0] === 'cp') { + return { stdout: '', stderr: '' }; + } + if (_file === 'pandoc') { + return { stdout: 'DOCX sync text', stderr: '' }; + } + return { stdout: '', stderr: '' }; + }); + + const tool = createMinioSyncTool(makeBackupConfig(), store, { execRunner }); + const result = await tool.execute({ prefix: 'knowledge/' }); + expect(result.success).toBe(true); + expect(result.output).toContain('Imported: 1'); + expect(write).toHaveBeenCalledWith( + 'global/knowledge/minio/knowledge/spec', + expect.stringContaining('DOCX sync text'), + 
'append', + ); + }); + it('returns an error when minio is disabled', async () => { const write = vi.fn(); const store = { write } as unknown as MemoryStore;