feat(tools): add docx extraction for minio ingestion

This commit is contained in:
William Valentin
2026-02-16 14:38:01 -08:00
parent e8a785b61f
commit 0548ab3833
6 changed files with 102 additions and 8 deletions
+2 -2
View File
@@ -627,8 +627,8 @@ backup:
When `backup.minio.enabled` is configured, Flynn also exposes MinIO tools:
- `minio.share`: upload a local file to the configured MinIO bucket and return a temporary download URL (`mc share download`)
- `minio.ingest`: read a text-like object (and PDF via `pdftotext` when available) from MinIO and append/replace a memory namespace
- `minio.sync`: recursively ingest a MinIO prefix into nested memory namespaces with object and size limits (including PDF extraction when available)
- `minio.ingest`: read a text-like object (plus PDF via `pdftotext` and DOCX via `pandoc`, falling back to `docx2txt`, when those tools are installed) from MinIO and append/replace a memory namespace
- `minio.sync`: recursively ingest a MinIO prefix into nested memory namespaces with object and size limits (including PDF/DOCX extraction when available)
## Kubernetes Tools
+2 -2
View File
@@ -968,7 +968,7 @@ Upload a local file to MinIO and return a temporary presigned download URL.
#### `minio.ingest`
Read a text-like object from MinIO (and PDFs when `pdftotext` is available) and write it into a memory namespace.
Read a text-like object from MinIO (and PDF via `pdftotext` or DOCX via `pandoc`/`docx2txt` when those tools are available locally) and write it into a memory namespace.
```json
{
@@ -1010,7 +1010,7 @@ Read a text-like object from MinIO (and PDFs when `pdftotext` is available) and
#### `minio.sync`
Sync text-like objects from a MinIO prefix into nested memory namespaces (with PDF extraction when available).
Sync text-like objects from a MinIO prefix into nested memory namespaces (with PDF/DOCX extraction when available).
```json
{
+15
View File
@@ -157,6 +157,21 @@
],
"test_status": "pnpm test:run src/tools/builtin/minio-ingest.test.ts src/tools/builtin/minio-sync.test.ts + pnpm typecheck passing"
},
"minio-docx-ingestion-support": {
"status": "completed",
"date": "2026-02-16",
"updated": "2026-02-16",
"summary": "Extended MinIO knowledge ingestion/sync to support DOCX extraction with fallback chain (`pandoc` then `docx2txt`) in both `minio.ingest` and `minio.sync` paths, plus tests/docs updates.",
"files_modified": [
"src/tools/builtin/minio-ingest.ts",
"src/tools/builtin/minio-ingest.test.ts",
"src/tools/builtin/minio-sync.test.ts",
"README.md",
"docs/api/TOOLS.md",
"docs/plans/state.json"
],
"test_status": "pnpm test:run src/tools/builtin/minio-ingest.test.ts src/tools/builtin/minio-sync.test.ts + pnpm typecheck passing"
},
"backup-session-summary-audit-trail": {
"status": "completed",
"date": "2026-02-16",
+33 -1
View File
@@ -32,6 +32,7 @@ describe('minio ingest internals', () => {
expect(minioIngestInternals.isLikelyTextObject('notes/today.md')).toBe(true);
expect(minioIngestInternals.isLikelyTextObject('logs/daemon.log')).toBe(true);
expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(true);
expect(minioIngestInternals.isLikelyTextObject('manual.docx')).toBe(true);
});
it('rejects likely binary extensions', () => {
@@ -79,7 +80,7 @@ describe('createMinioIngestTool', () => {
const execRunner = vi.fn();
const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });
const result = await tool.execute({ object_key: 'knowledge/diagram.docx' });
const result = await tool.execute({ object_key: 'knowledge/diagram.exe' });
expect(result.success).toBe(false);
expect(result.error).toContain('Unsupported object type');
expect(execRunner).not.toHaveBeenCalled();
@@ -121,6 +122,37 @@ describe('createMinioIngestTool', () => {
);
});
// Happy path for DOCX ingestion: the stubbed runner lets every command succeed and only
// `pandoc` produces output, which must end up in the memory write.
it('extracts DOCX text with pandoc', async () => {
  const write = vi.fn();
  const store = { write } as unknown as MemoryStore;

  // `mc cp` (and anything else) returns empty output; pandoc returns the extracted text.
  const execRunner = vi.fn(async (command: string, _args: string[]) => ({
    stdout: command === 'pandoc' ? 'Extracted DOCX text' : '',
    stderr: '',
  }));

  const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });
  const outcome = await tool.execute({ object_key: 'knowledge/spec.docx', mode: 'replace' });

  expect(outcome.success).toBe(true);

  // The extracted text is written with the requested mode.
  expect(write).toHaveBeenCalledWith(
    'global/knowledge',
    expect.stringContaining('Extracted DOCX text'),
    'replace',
  );

  // pandoc must have been asked for plain-text output.
  const plainTextArgs = expect.arrayContaining(['-t', 'plain']);
  expect(execRunner).toHaveBeenCalledWith('pandoc', plainTextArgs, expect.any(Object));
});
it('allows non-text extension when force=true', async () => {
const write = vi.fn();
const store = { write } as unknown as MemoryStore;
+22 -2
View File
@@ -34,6 +34,7 @@ const TEXT_EXTENSIONS = new Set([
]);
// Lower-case file extensions of binary formats that have a text-extraction path (PDF, DOCX).
const EXTRACTABLE_BINARY_EXTENSIONS = new Set<string>(['.pdf', '.docx']);
export interface MinioIngestDeps {
@@ -61,6 +62,7 @@ async function readObjectText(
objectKey: string,
env: NodeJS.ProcessEnv,
): Promise<string> {
const ext = extname(objectKey).toLowerCase();
if (!isExtractableBinaryObject(objectKey)) {
const { stdout } = await runner('mc', ['cat', remotePath], { env, maxBuffer: 20 * 1024 * 1024 });
return toText(stdout);
@@ -70,8 +72,26 @@ async function readObjectText(
const localPath = join(tempDir, 'object.bin');
try {
await runner('mc', ['cp', remotePath, localPath], { env, maxBuffer: 20 * 1024 * 1024 });
const { stdout } = await runner('pdftotext', ['-q', localPath, '-'], { maxBuffer: 20 * 1024 * 1024 });
return toText(stdout);
if (ext === '.pdf') {
const { stdout } = await runner('pdftotext', ['-q', localPath, '-'], { maxBuffer: 20 * 1024 * 1024 });
return toText(stdout);
}
if (ext === '.docx') {
  const maxBuffer = 20 * 1024 * 1024;
  try {
    // The downloaded copy is named `object.bin`, so pandoc cannot infer the input format
    // from the file extension and would fall back to its Markdown reader. Name the reader
    // explicitly so the DOCX archive is actually parsed as DOCX.
    const { stdout } = await runner('pandoc', [localPath, '-f', 'docx', '-t', 'plain'], { maxBuffer });
    return toText(stdout);
  } catch (pandocError) {
    // pandoc is missing or failed: try docx2txt, writing the text to stdout (`-`).
    try {
      const { stdout } = await runner('docx2txt', [localPath, '-'], { maxBuffer });
      return toText(stdout);
    } catch (docx2txtError) {
      // Both extractors failed; report both causes so the caller can see what to install or fix.
      const pErr = pandocError instanceof Error ? pandocError.message : String(pandocError);
      const dErr = docx2txtError instanceof Error ? docx2txtError.message : String(docx2txtError);
      throw new Error(`Failed to extract DOCX text (pandoc/docx2txt unavailable or failed): ${pErr}; ${dErr}`);
    }
  }
}
throw new Error(`Unsupported extractable object type: ${objectKey}`);
} finally {
rmSync(tempDir, { recursive: true, force: true });
}
+28 -1
View File
@@ -104,7 +104,7 @@ describe('createMinioSyncTool', () => {
const store = { write } as unknown as MemoryStore;
const execRunner = vi.fn(async (_file: string, args: string[]) => {
if (args[0] === 'ls') {
return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.docx"}', stderr: '' };
return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.exe"}', stderr: '' };
}
return { stdout: 'fake text', stderr: '' };
});
@@ -117,6 +117,33 @@ describe('createMinioSyncTool', () => {
expect(write).not.toHaveBeenCalled();
});
// A prefix listing that contains one DOCX object should import it through the extraction
// path and append the extracted text to the object's nested namespace.
it('syncs DOCX objects via text extraction', async () => {
  const write = vi.fn();
  const store = { write } as unknown as MemoryStore;

  // `mc ls` lists a single DOCX object, pandoc returns its text, and every other
  // command (including `mc cp`) succeeds with empty output.
  const execRunner = vi.fn(async (command: string, args: string[]) => {
    let stdout = '';
    if (command === 'mc' && args[0] === 'ls') {
      stdout = '{"status":"success","type":"file","key":"knowledge/spec.docx"}';
    } else if (command === 'pandoc') {
      stdout = 'DOCX sync text';
    }
    return { stdout, stderr: '' };
  });

  const tool = createMinioSyncTool(makeBackupConfig(), store, { execRunner });
  const outcome = await tool.execute({ prefix: 'knowledge/' });

  expect(outcome.success).toBe(true);
  expect(outcome.output).toContain('Imported: 1');

  // The target namespace is built from the object key with its extension removed.
  expect(write).toHaveBeenCalledWith(
    'global/knowledge/minio/knowledge/spec',
    expect.stringContaining('DOCX sync text'),
    'append',
  );
});
it('returns an error when minio is disabled', async () => {
const write = vi.fn();
const store = { write } as unknown as MemoryStore;