feat(tools): add pdf extraction for minio ingestion
This commit is contained in:
@@ -31,10 +31,11 @@ describe('minio ingest internals', () => {
|
||||
it('accepts known text-like extensions', () => {
  // Plain-text extensions pass the extension filter.
  expect(minioIngestInternals.isLikelyTextObject('notes/today.md')).toBe(true);
  expect(minioIngestInternals.isLikelyTextObject('logs/daemon.log')).toBe(true);
  // PDFs pass the filter too: they are listed as an extractable binary type.
  expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(true);
});
|
||||
|
||||
it('rejects likely binary extensions', () => {
  // `.pdf` is intentionally absent here: the sibling "accepts" test expects it to pass the
  // filter, so asserting `false` for it as well would be contradictory.
  expect(minioIngestInternals.isLikelyTextObject('manual.exe')).toBe(false);
});
|
||||
});
|
||||
|
||||
@@ -72,18 +73,54 @@ describe('createMinioIngestTool', () => {
|
||||
);
|
||||
});
|
||||
|
||||
it('rejects unsupported binary object unless force=true', async () => {
  const write = vi.fn();
  const store = { write } as unknown as MemoryStore;
  const execRunner = vi.fn();
  const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });

  // `.docx` is neither a text extension nor an extractable binary one.
  const result = await tool.execute({ object_key: 'knowledge/diagram.docx' });

  expect(result.success).toBe(false);
  expect(result.error).toContain('Unsupported object type');
  // The rejection must happen before any external command is run.
  expect(execRunner).not.toHaveBeenCalled();
});
|
||||
|
||||
it('extracts PDF text with pdftotext', async () => {
  const write = vi.fn();
  const store = { write } as unknown as MemoryStore;
  // Fake runner: the `mc cp` download yields nothing, `pdftotext` yields the extracted text.
  const execRunner = vi.fn(async (file: string, args: string[]) => {
    if (args[0] === 'cp') {
      return { stdout: '', stderr: '' };
    }
    if (file === 'pdftotext') {
      return { stdout: 'Extracted PDF text', stderr: '' };
    }
    return { stdout: '', stderr: '' };
  });
  const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });

  const result = await tool.execute({
    object_key: 'knowledge/diagram.pdf',
    mode: 'replace',
  });

  expect(result.success).toBe(true);
  // The extracted text is what gets written to the memory store.
  expect(write).toHaveBeenCalledWith(
    'global/knowledge',
    expect.stringContaining('Extracted PDF text'),
    'replace',
  );
  // The object is downloaded with `mc cp` ...
  expect(execRunner).toHaveBeenCalledWith(
    'mc',
    expect.arrayContaining(['cp', 'flynningest/flynn-knowledge/knowledge/diagram.pdf']),
    expect.objectContaining({ env: expect.any(Object) }),
  );
  // ... and then converted with `pdftotext` in quiet mode.
  expect(execRunner).toHaveBeenCalledWith(
    'pdftotext',
    expect.arrayContaining(['-q']),
    expect.any(Object),
  );
});
|
||||
|
||||
it('allows non-text extension when force=true', async () => {
|
||||
const write = vi.fn();
|
||||
const store = { write } as unknown as MemoryStore;
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
import { promisify } from 'node:util';
|
||||
import { execFile } from 'node:child_process';
|
||||
import { extname } from 'node:path';
|
||||
import { mkdtempSync, rmSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { tmpdir } from 'node:os';
|
||||
import type { BackupConfig } from '../../config/schema.js';
|
||||
import type { MemoryStore } from '../../memory/store.js';
|
||||
import type { Tool, ToolResult } from '../types.js';
|
||||
@@ -12,7 +15,7 @@ type ExecRunner = (
|
||||
file: string,
|
||||
args: string[],
|
||||
options?: { env?: NodeJS.ProcessEnv; maxBuffer?: number },
|
||||
) => Promise<{ stdout: string; stderr: string }>;
|
||||
) => Promise<{ stdout: string | Buffer; stderr: string | Buffer }>;
|
||||
|
||||
const TEXT_EXTENSIONS = new Set([
|
||||
'.txt',
|
||||
@@ -29,6 +32,9 @@ const TEXT_EXTENSIONS = new Set([
|
||||
'.html',
|
||||
'.htm',
|
||||
]);
|
||||
/**
 * Lower-case extensions (with leading dot) of binary formats whose text is obtained through
 * an extraction step instead of being read directly.
 */
const EXTRACTABLE_BINARY_EXTENSIONS = new Set([
  '.pdf',
]);
|
||||
|
||||
export interface MinioIngestDeps {
|
||||
execRunner?: ExecRunner;
|
||||
@@ -42,12 +48,40 @@ function isLikelyText(content: string): boolean {
|
||||
/**
 * Decides, from the object key's extension alone, whether the object is a candidate for
 * ingestion.
 *
 * @param objectKey - Key of the object inside the bucket.
 * @returns `true` when the key has no extension, or when its lower-cased extension is in
 *   `TEXT_EXTENSIONS` or `EXTRACTABLE_BINARY_EXTENSIONS`; `false` otherwise.
 */
function isLikelyTextObject(objectKey: string): boolean {
  const ext = extname(objectKey).toLowerCase();
  // Extension-less keys are given the benefit of the doubt.
  if (!ext) {
    return true;
  }
  return TEXT_EXTENSIONS.has(ext) || EXTRACTABLE_BINARY_EXTENSIONS.has(ext);
}
|
||||
|
||||
/**
 * Reports whether the object key's extension (compared case-insensitively) is listed in
 * `EXTRACTABLE_BINARY_EXTENSIONS`.
 *
 * @param objectKey - Key of the object inside the bucket.
 */
function isExtractableBinaryObject(objectKey: string): boolean {
  const ext = extname(objectKey).toLowerCase();
  return EXTRACTABLE_BINARY_EXTENSIONS.has(ext);
}
|
||||
|
||||
/**
 * Reads a remote object and returns its content as text.
 *
 * Keys that are not extractable binaries are streamed with `mc cat` and returned as-is.
 * Extractable binaries are first copied with `mc cp` into a fresh temporary directory and
 * then converted with `pdftotext`, whose standard output is returned. The temporary
 * directory is removed afterwards, whether or not the commands succeeded.
 *
 * @param runner - Command runner used for both `mc` and `pdftotext`.
 * @param remotePath - Path of the object as understood by `mc`.
 * @param objectKey - Object key; only its extension is used to pick the read strategy.
 * @param env - Environment passed to the `mc` invocations.
 * @returns The object's text content.
 * @throws Whatever `runner` rejects with; the error is not caught here.
 */
async function readObjectText(
  runner: ExecRunner,
  remotePath: string,
  objectKey: string,
  env: NodeJS.ProcessEnv,
): Promise<string> {
  // Shared output limit (20 MiB) for every command spawned below.
  const maxBuffer = 20 * 1024 * 1024;

  if (!isExtractableBinaryObject(objectKey)) {
    const { stdout } = await runner('mc', ['cat', remotePath], { env, maxBuffer });
    return toText(stdout);
  }

  const tempDir = mkdtempSync(join(tmpdir(), 'flynn-minio-ingest-'));
  const localPath = join(tempDir, 'object.bin');
  try {
    await runner('mc', ['cp', remotePath, localPath], { env, maxBuffer });
    // `-` sends the extracted text to stdout. The encoding is pinned to UTF-8 because the
    // output is decoded as UTF-8 and some pdftotext builds default to Latin1.
    const { stdout } = await runner(
      'pdftotext',
      ['-q', '-enc', 'UTF-8', localPath, '-'],
      { maxBuffer },
    );
    return toText(stdout);
  } finally {
    // Always discard the downloaded copy, even when a command failed.
    rmSync(tempDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
/**
 * Bundle of module-private helpers, exported together so that other modules and tests can
 * call them through a single named object.
 */
export const minioIngestInternals = {
  isLikelyText,
  isLikelyTextObject,
  isExtractableBinaryObject,
  readObjectText,
};
|
||||
|
||||
interface MinioIngestArgs {
|
||||
@@ -139,10 +173,9 @@ export function createMinioIngestTool(config: BackupConfig, store: MemoryStore,
|
||||
const remotePath = `${alias}/${bucket}/${objectKey}`;
|
||||
|
||||
try {
|
||||
const { stdout } = await runner('mc', ['cat', remotePath], { env, maxBuffer: 20 * 1024 * 1024 });
|
||||
const text = typeof stdout === 'string' ? stdout : stdout.toString('utf-8');
|
||||
const text = await readObjectText(runner, remotePath, objectKey, env);
|
||||
|
||||
if (!force && !isLikelyText(text)) {
|
||||
if (!force && !isExtractableBinaryObject(objectKey) && !isLikelyText(text)) {
|
||||
return {
|
||||
success: false,
|
||||
output: '',
|
||||
@@ -180,3 +213,6 @@ export function createMinioIngestTool(config: BackupConfig, store: MemoryStore,
|
||||
},
|
||||
};
|
||||
}
|
||||
/**
 * Normalises command output to a string.
 *
 * @param value - Output captured from a command, either already a string or a `Buffer`.
 * @returns `value` unchanged when it is a string, otherwise the buffer decoded as UTF-8.
 */
function toText(value: string | Buffer): string {
  if (typeof value === 'string') {
    return value;
  }
  return value.toString('utf-8');
}
|
||||
|
||||
@@ -104,7 +104,7 @@ describe('createMinioSyncTool', () => {
|
||||
const store = { write } as unknown as MemoryStore;
|
||||
const execRunner = vi.fn(async (_file: string, args: string[]) => {
|
||||
if (args[0] === 'ls') {
|
||||
return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.pdf"}', stderr: '' };
|
||||
return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.docx"}', stderr: '' };
|
||||
}
|
||||
return { stdout: 'fake text', stderr: '' };
|
||||
});
|
||||
|
||||
@@ -12,7 +12,7 @@ type ExecRunner = (
|
||||
file: string,
|
||||
args: string[],
|
||||
options?: { env?: NodeJS.ProcessEnv; maxBuffer?: number },
|
||||
) => Promise<{ stdout: string; stderr: string }>;
|
||||
) => Promise<{ stdout: string | Buffer; stderr: string | Buffer }>;
|
||||
|
||||
interface MinioSyncArgs {
|
||||
prefix: string;
|
||||
@@ -168,13 +168,9 @@ export function createMinioSyncTool(config: BackupConfig, store: MemoryStore, de
|
||||
}
|
||||
|
||||
const remotePath = `${alias}/${bucket}/${key}`;
|
||||
const { stdout: objectStdout } = await runner('mc', ['cat', remotePath], {
|
||||
env,
|
||||
maxBuffer: 20 * 1024 * 1024,
|
||||
});
|
||||
const text = typeof objectStdout === 'string' ? objectStdout : objectStdout.toString('utf-8');
|
||||
const text = await minioIngestInternals.readObjectText(runner, remotePath, key, env);
|
||||
|
||||
if (!force && !minioIngestInternals.isLikelyText(text)) {
|
||||
if (!force && !minioIngestInternals.isExtractableBinaryObject(key) && !minioIngestInternals.isLikelyText(text)) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user