feat(tools): add docx extraction for minio ingestion

This commit is contained in:
William Valentin
2026-02-16 14:38:01 -08:00
parent e8a785b61f
commit 0548ab3833
6 changed files with 102 additions and 8 deletions
+33 -1
View File
@@ -32,6 +32,7 @@ describe('minio ingest internals', () => {
expect(minioIngestInternals.isLikelyTextObject('notes/today.md')).toBe(true);
expect(minioIngestInternals.isLikelyTextObject('logs/daemon.log')).toBe(true);
expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(true);
expect(minioIngestInternals.isLikelyTextObject('manual.docx')).toBe(true);
});
it('rejects likely binary extensions', () => {
@@ -79,7 +80,7 @@ describe('createMinioIngestTool', () => {
const execRunner = vi.fn();
const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });
const result = await tool.execute({ object_key: 'knowledge/diagram.docx' });
const result = await tool.execute({ object_key: 'knowledge/diagram.exe' });
expect(result.success).toBe(false);
expect(result.error).toContain('Unsupported object type');
expect(execRunner).not.toHaveBeenCalled();
@@ -121,6 +122,37 @@ describe('createMinioIngestTool', () => {
);
});
// Ingesting a .docx object should run pandoc and write pandoc's stdout to the
// store under 'global/knowledge' using the requested write mode.
it('extracts DOCX text with pandoc', async () => {
  const write = vi.fn();
  const store = { write } as unknown as MemoryStore;

  // Stubbed command runner: only the pandoc invocation yields text; the
  // `mc cp` call and any other command report empty stdout/stderr.
  const execRunner = vi.fn(async (command: string, args: string[]) => {
    const isMcCopy = command === 'mc' && args[0] === 'cp';
    if (isMcCopy || command !== 'pandoc') {
      return { stdout: '', stderr: '' };
    }
    return { stdout: 'Extracted DOCX text', stderr: '' };
  });

  const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });
  const request = { object_key: 'knowledge/spec.docx', mode: 'replace' };
  const result = await tool.execute(request);

  expect(result.success).toBe(true);

  // The text produced by pandoc must be part of what is written to the store.
  expect(write).toHaveBeenCalledWith(
    'global/knowledge',
    expect.stringContaining('Extracted DOCX text'),
    'replace',
  );

  // pandoc must have been invoked with arguments that include `-t plain`.
  expect(execRunner).toHaveBeenCalledWith(
    'pandoc',
    expect.arrayContaining(['-t', 'plain']),
    expect.any(Object),
  );
});
it('allows non-text extension when force=true', async () => {
const write = vi.fn();
const store = { write } as unknown as MemoryStore;