feat(tools): add pdf extraction for minio ingestion
This commit is contained in:
@@ -31,10 +31,11 @@ describe('minio ingest internals', () => {
|
||||
it('accepts known text-like extensions', () => {
|
||||
expect(minioIngestInternals.isLikelyTextObject('notes/today.md')).toBe(true);
|
||||
expect(minioIngestInternals.isLikelyTextObject('logs/daemon.log')).toBe(true);
|
||||
expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(true);
|
||||
});
|
||||
|
||||
it('rejects likely binary extensions', () => {
|
||||
expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(false);
|
||||
expect(minioIngestInternals.isLikelyTextObject('manual.exe')).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -72,18 +73,54 @@ describe('createMinioIngestTool', () => {
|
||||
);
|
||||
});
|
||||
|
||||
it('rejects likely binary object unless force=true', async () => {
|
||||
it('rejects unsupported binary object unless force=true', async () => {
|
||||
const write = vi.fn();
|
||||
const store = { write } as unknown as MemoryStore;
|
||||
const execRunner = vi.fn();
|
||||
const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });
|
||||
|
||||
const result = await tool.execute({ object_key: 'knowledge/diagram.pdf' });
|
||||
const result = await tool.execute({ object_key: 'knowledge/diagram.docx' });
|
||||
expect(result.success).toBe(false);
|
||||
expect(result.error).toContain('Unsupported object type');
|
||||
expect(execRunner).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('extracts PDF text with pdftotext', async () => {
|
||||
const write = vi.fn();
|
||||
const store = { write } as unknown as MemoryStore;
|
||||
const execRunner = vi.fn(async (_file: string, args: string[]) => {
|
||||
if (args[0] === 'cp') {
|
||||
return { stdout: '', stderr: '' };
|
||||
}
|
||||
if (_file === 'pdftotext') {
|
||||
return { stdout: 'Extracted PDF text', stderr: '' };
|
||||
}
|
||||
return { stdout: '', stderr: '' };
|
||||
});
|
||||
const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });
|
||||
|
||||
const result = await tool.execute({
|
||||
object_key: 'knowledge/diagram.pdf',
|
||||
mode: 'replace',
|
||||
});
|
||||
expect(result.success).toBe(true);
|
||||
expect(write).toHaveBeenCalledWith(
|
||||
'global/knowledge',
|
||||
expect.stringContaining('Extracted PDF text'),
|
||||
'replace',
|
||||
);
|
||||
expect(execRunner).toHaveBeenCalledWith(
|
||||
'mc',
|
||||
expect.arrayContaining(['cp', 'flynningest/flynn-knowledge/knowledge/diagram.pdf']),
|
||||
expect.objectContaining({ env: expect.any(Object) }),
|
||||
);
|
||||
expect(execRunner).toHaveBeenCalledWith(
|
||||
'pdftotext',
|
||||
expect.arrayContaining(['-q']),
|
||||
expect.any(Object),
|
||||
);
|
||||
});
|
||||
|
||||
it('allows non-text extension when force=true', async () => {
|
||||
const write = vi.fn();
|
||||
const store = { write } as unknown as MemoryStore;
|
||||
|
||||
Reference in New Issue
Block a user