feat(tools): add pdf extraction for minio ingestion

This commit is contained in:
William Valentin
2026-02-16 14:33:58 -08:00
parent 63df791b26
commit e8a785b61f
7 changed files with 105 additions and 20 deletions
+3 -7
View File
@@ -12,7 +12,7 @@ type ExecRunner = (
file: string,
args: string[],
options?: { env?: NodeJS.ProcessEnv; maxBuffer?: number },
) => Promise<{ stdout: string; stderr: string }>;
) => Promise<{ stdout: string | Buffer; stderr: string | Buffer }>;
interface MinioSyncArgs {
prefix: string;
@@ -168,13 +168,9 @@ export function createMinioSyncTool(config: BackupConfig, store: MemoryStore, de
}
const remotePath = `${alias}/${bucket}/${key}`;
const { stdout: objectStdout } = await runner('mc', ['cat', remotePath], {
env,
maxBuffer: 20 * 1024 * 1024,
});
const text = typeof objectStdout === 'string' ? objectStdout : objectStdout.toString('utf-8');
const text = await minioIngestInternals.readObjectText(runner, remotePath, key, env);
if (!force && !minioIngestInternals.isLikelyText(text)) {
if (!force && !minioIngestInternals.isExtractableBinaryObject(key) && !minioIngestInternals.isLikelyText(text)) {
skipped++;
continue;
}