feat(tools): add docx extraction for minio ingestion
This commit is contained in:
@@ -32,6 +32,7 @@ describe('minio ingest internals', () => {
|
||||
expect(minioIngestInternals.isLikelyTextObject('notes/today.md')).toBe(true);
|
||||
expect(minioIngestInternals.isLikelyTextObject('logs/daemon.log')).toBe(true);
|
||||
expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(true);
|
||||
expect(minioIngestInternals.isLikelyTextObject('manual.docx')).toBe(true);
|
||||
});
|
||||
|
||||
it('rejects likely binary extensions', () => {
|
||||
@@ -79,7 +80,7 @@ describe('createMinioIngestTool', () => {
|
||||
const execRunner = vi.fn();
|
||||
const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });
|
||||
|
||||
const result = await tool.execute({ object_key: 'knowledge/diagram.docx' });
|
||||
const result = await tool.execute({ object_key: 'knowledge/diagram.exe' });
|
||||
expect(result.success).toBe(false);
|
||||
expect(result.error).toContain('Unsupported object type');
|
||||
expect(execRunner).not.toHaveBeenCalled();
|
||||
@@ -121,6 +122,37 @@ describe('createMinioIngestTool', () => {
|
||||
);
|
||||
});
|
||||
|
||||
// Checks the DOCX ingestion path: with a stubbed command runner, ingesting a
// `.docx` object key must succeed, store the text returned by the `pandoc`
// stub, and invoke `pandoc` with plain-text output arguments.
it('extracts DOCX text with pandoc', async () => {
|
||||
const write = vi.fn();
|
||||
// Only `write` is stubbed; the double cast lets the partial object stand in for MemoryStore.
const store = { write } as unknown as MemoryStore;
|
||||
// Fake command runner. Note that `_file` is read below despite its underscore prefix.
const execRunner = vi.fn(async (_file: string, args: string[]) => {
|
||||
// `mc cp` call: report success with no output (no file is actually created by this stub).
if (_file === 'mc' && args[0] === 'cp') {
|
||||
return { stdout: '', stderr: '' };
|
||||
}
|
||||
// `pandoc` call: return the marker text the assertions below look for.
if (_file === 'pandoc') {
|
||||
return { stdout: 'Extracted DOCX text', stderr: '' };
|
||||
}
|
||||
// Any other command: empty successful result.
return { stdout: '', stderr: '' };
|
||||
});
|
||||
const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });
|
||||
|
||||
const result = await tool.execute({
|
||||
object_key: 'knowledge/spec.docx',
|
||||
mode: 'replace',
|
||||
});
|
||||
expect(result.success).toBe(true);
|
||||
// The stored content must contain the pandoc stub output and be written in the requested mode.
expect(write).toHaveBeenCalledWith(
|
||||
'global/knowledge',
|
||||
expect.stringContaining('Extracted DOCX text'),
|
||||
'replace',
|
||||
);
|
||||
// `arrayContaining` checks that '-t' and 'plain' are present without pinning the
// temp-file path argument or the argument order.
expect(execRunner).toHaveBeenCalledWith(
|
||||
'pandoc',
|
||||
expect.arrayContaining(['-t', 'plain']),
|
||||
expect.any(Object),
|
||||
);
|
||||
});
|
||||
|
||||
it('allows non-text extension when force=true', async () => {
|
||||
const write = vi.fn();
|
||||
const store = { write } as unknown as MemoryStore;
|
||||
|
||||
@@ -34,6 +34,7 @@ const TEXT_EXTENSIONS = new Set([
|
||||
]);
|
||||
// Lower-case file extensions (with leading dot) of binary formats that are
// converted to text by an external tool instead of being read as text.
// NOTE(review): presumably consulted by isExtractableBinaryObject — confirm;
// every entry here needs a matching extraction branch in readObjectText.
const EXTRACTABLE_BINARY_EXTENSIONS = new Set([
|
||||
'.pdf',
|
||||
'.docx',
|
||||
]);
|
||||
|
||||
export interface MinioIngestDeps {
|
||||
@@ -61,6 +62,7 @@ async function readObjectText(
|
||||
objectKey: string,
|
||||
env: NodeJS.ProcessEnv,
|
||||
): Promise<string> {
|
||||
const ext = extname(objectKey).toLowerCase();
|
||||
if (!isExtractableBinaryObject(objectKey)) {
|
||||
const { stdout } = await runner('mc', ['cat', remotePath], { env, maxBuffer: 20 * 1024 * 1024 });
|
||||
return toText(stdout);
|
||||
@@ -70,8 +72,26 @@ async function readObjectText(
|
||||
const localPath = join(tempDir, 'object.bin');
|
||||
try {
|
||||
await runner('mc', ['cp', remotePath, localPath], { env, maxBuffer: 20 * 1024 * 1024 });
|
||||
const { stdout } = await runner('pdftotext', ['-q', localPath, '-'], { maxBuffer: 20 * 1024 * 1024 });
|
||||
return toText(stdout);
|
||||
if (ext === '.pdf') {
|
||||
const { stdout } = await runner('pdftotext', ['-q', localPath, '-'], { maxBuffer: 20 * 1024 * 1024 });
|
||||
return toText(stdout);
|
||||
}
|
||||
if (ext === '.docx') {
|
||||
try {
|
||||
const { stdout } = await runner('pandoc', [localPath, '-t', 'plain'], { maxBuffer: 20 * 1024 * 1024 });
|
||||
return toText(stdout);
|
||||
} catch (pandocError) {
|
||||
try {
|
||||
const { stdout } = await runner('docx2txt', [localPath, '-'], { maxBuffer: 20 * 1024 * 1024 });
|
||||
return toText(stdout);
|
||||
} catch (docx2txtError) {
|
||||
const pErr = pandocError instanceof Error ? pandocError.message : String(pandocError);
|
||||
const dErr = docx2txtError instanceof Error ? docx2txtError.message : String(docx2txtError);
|
||||
throw new Error(`Failed to extract DOCX text (pandoc/docx2txt unavailable or failed): ${pErr}; ${dErr}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new Error(`Unsupported extractable object type: ${objectKey}`);
|
||||
} finally {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
@@ -104,7 +104,7 @@ describe('createMinioSyncTool', () => {
|
||||
const store = { write } as unknown as MemoryStore;
|
||||
const execRunner = vi.fn(async (_file: string, args: string[]) => {
|
||||
if (args[0] === 'ls') {
|
||||
return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.docx"}', stderr: '' };
|
||||
return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.exe"}', stderr: '' };
|
||||
}
|
||||
return { stdout: 'fake text', stderr: '' };
|
||||
});
|
||||
@@ -117,6 +117,33 @@ describe('createMinioSyncTool', () => {
|
||||
expect(write).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
// Checks the sync path for DOCX objects: with a stubbed command runner that
// lists a single `.docx` object, syncing the prefix must succeed, report one
// imported object, and store the text returned by the `pandoc` stub.
it('syncs DOCX objects via text extraction', async () => {
|
||||
const write = vi.fn();
|
||||
// Only `write` is stubbed; the double cast lets the partial object stand in for MemoryStore.
const store = { write } as unknown as MemoryStore;
|
||||
// Fake command runner. Note that `_file` is read below despite its underscore prefix.
const execRunner = vi.fn(async (_file: string, args: string[]) => {
|
||||
// `mc ls` call: emit one JSON record describing a file object with a .docx key.
if (_file === 'mc' && args[0] === 'ls') {
|
||||
return { stdout: '{"status":"success","type":"file","key":"knowledge/spec.docx"}', stderr: '' };
|
||||
}
|
||||
// `mc cp` call: report success with no output (no file is actually created by this stub).
if (_file === 'mc' && args[0] === 'cp') {
|
||||
return { stdout: '', stderr: '' };
|
||||
}
|
||||
// `pandoc` call: return the marker text the assertions below look for.
if (_file === 'pandoc') {
|
||||
return { stdout: 'DOCX sync text', stderr: '' };
|
||||
}
|
||||
// Any other command: empty successful result.
return { stdout: '', stderr: '' };
|
||||
});
|
||||
|
||||
const tool = createMinioSyncTool(makeBackupConfig(), store, { execRunner });
|
||||
const result = await tool.execute({ prefix: 'knowledge/' });
|
||||
expect(result.success).toBe(true);
|
||||
expect(result.output).toContain('Imported: 1');
|
||||
// The expected target path carries the object key without its `.docx` extension,
// and the write is expected in 'append' mode (no mode was passed to execute).
expect(write).toHaveBeenCalledWith(
|
||||
'global/knowledge/minio/knowledge/spec',
|
||||
expect.stringContaining('DOCX sync text'),
|
||||
'append',
|
||||
);
|
||||
});
|
||||
|
||||
it('returns an error when minio is disabled', async () => {
|
||||
const write = vi.fn();
|
||||
const store = { write } as unknown as MemoryStore;
|
||||
|
||||
Reference in New Issue
Block a user