feat(tools): add pdf extraction for minio ingestion

This commit is contained in:
William Valentin
2026-02-16 14:33:58 -08:00
parent 63df791b26
commit e8a785b61f
7 changed files with 105 additions and 20 deletions
+2 -2
View File
@@ -627,8 +627,8 @@ backup:
When `backup.minio.enabled` is configured, Flynn also exposes MinIO tools:
- `minio.share`: upload a local file to the configured MinIO bucket and return a temporary download URL (`mc share download`)
- `minio.ingest`: read a text-like object from MinIO and append/replace a memory namespace (useful for syncing notes/runbooks into long-term memory)
- `minio.sync`: recursively ingest a MinIO prefix into nested memory namespaces with object and size limits
- `minio.ingest`: read a text-like object (or a PDF, extracted with `pdftotext` when available) from MinIO and append/replace a memory namespace
- `minio.sync`: recursively ingest a MinIO prefix into nested memory namespaces with object and size limits (including PDF extraction when available)
## Kubernetes Tools
+2 -2
View File
@@ -968,7 +968,7 @@ Upload a local file to MinIO and return a temporary presigned download URL.
#### `minio.ingest`
Read a text-like object from MinIO and write it into a memory namespace.
Read a text-like object from MinIO (and PDFs when `pdftotext` is available) and write it into a memory namespace.
```json
{
@@ -1010,7 +1010,7 @@ Read a text-like object from MinIO and write it into a memory namespace.
#### `minio.sync`
Sync text-like objects from a MinIO prefix into nested memory namespaces.
Sync text-like objects from a MinIO prefix into nested memory namespaces (with PDF extraction when available).
```json
{
+16
View File
@@ -141,6 +141,22 @@
],
"test_status": "pnpm test:run src/tools/builtin/k8s.test.ts src/tools/policy.test.ts src/config/schema.test.ts + pnpm typecheck passing"
},
"minio-pdf-ingestion-support": {
"status": "completed",
"date": "2026-02-16",
"updated": "2026-02-16",
"summary": "Extended MinIO knowledge ingestion to support PDF documents via `pdftotext` extraction in both `minio.ingest` and `minio.sync` paths (when available), while preserving text-safety checks for other binary formats. Updated tests and docs accordingly.",
"files_modified": [
"src/tools/builtin/minio-ingest.ts",
"src/tools/builtin/minio-sync.ts",
"src/tools/builtin/minio-ingest.test.ts",
"src/tools/builtin/minio-sync.test.ts",
"README.md",
"docs/api/TOOLS.md",
"docs/plans/state.json"
],
"test_status": "pnpm test:run src/tools/builtin/minio-ingest.test.ts src/tools/builtin/minio-sync.test.ts + pnpm typecheck passing"
},
"backup-session-summary-audit-trail": {
"status": "completed",
"date": "2026-02-16",
+40 -3
View File
@@ -31,10 +31,11 @@ describe('minio ingest internals', () => {
it('accepts known text-like extensions', () => {
expect(minioIngestInternals.isLikelyTextObject('notes/today.md')).toBe(true);
expect(minioIngestInternals.isLikelyTextObject('logs/daemon.log')).toBe(true);
expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(true);
});
it('rejects likely binary extensions', () => {
expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(false);
expect(minioIngestInternals.isLikelyTextObject('manual.exe')).toBe(false);
});
});
@@ -72,18 +73,54 @@ describe('createMinioIngestTool', () => {
);
});
it('rejects likely binary object unless force=true', async () => {
it('rejects unsupported binary object unless force=true', async () => {
const write = vi.fn();
const store = { write } as unknown as MemoryStore;
const execRunner = vi.fn();
const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });
const result = await tool.execute({ object_key: 'knowledge/diagram.pdf' });
const result = await tool.execute({ object_key: 'knowledge/diagram.docx' });
expect(result.success).toBe(false);
expect(result.error).toContain('Unsupported object type');
expect(execRunner).not.toHaveBeenCalled();
});
it('extracts PDF text with pdftotext', async () => {
  // Fake runner: only a `pdftotext` invocation (whose first argument is not `cp`)
  // yields text; every other command succeeds with empty output.
  const execRunner = vi.fn(async (command: string, args: string[]) => {
    const isPdfExtraction = args[0] !== 'cp' && command === 'pdftotext';
    return { stdout: isPdfExtraction ? 'Extracted PDF text' : '', stderr: '' };
  });
  const write = vi.fn();
  const store = { write } as unknown as MemoryStore;
  const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner });

  const result = await tool.execute({
    object_key: 'knowledge/diagram.pdf',
    mode: 'replace',
  });

  // The extracted text must land in the memory namespace in replace mode.
  expect(result.success).toBe(true);
  expect(write).toHaveBeenCalledWith(
    'global/knowledge',
    expect.stringContaining('Extracted PDF text'),
    'replace',
  );

  // The object is first downloaded with `mc cp` ...
  const expectedCopyArgs = expect.arrayContaining([
    'cp',
    'flynningest/flynn-knowledge/knowledge/diagram.pdf',
  ]);
  expect(execRunner).toHaveBeenCalledWith(
    'mc',
    expectedCopyArgs,
    expect.objectContaining({ env: expect.any(Object) }),
  );

  // ... and then converted by `pdftotext` in quiet mode.
  expect(execRunner).toHaveBeenCalledWith(
    'pdftotext',
    expect.arrayContaining(['-q']),
    expect.any(Object),
  );
});
it('allows non-text extension when force=true', async () => {
const write = vi.fn();
const store = { write } as unknown as MemoryStore;
+41 -5
View File
@@ -1,6 +1,9 @@
import { promisify } from 'node:util';
import { execFile } from 'node:child_process';
import { extname } from 'node:path';
import { mkdtempSync, rmSync } from 'node:fs';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import type { BackupConfig } from '../../config/schema.js';
import type { MemoryStore } from '../../memory/store.js';
import type { Tool, ToolResult } from '../types.js';
@@ -12,7 +15,7 @@ type ExecRunner = (
file: string,
args: string[],
options?: { env?: NodeJS.ProcessEnv; maxBuffer?: number },
) => Promise<{ stdout: string; stderr: string }>;
) => Promise<{ stdout: string | Buffer; stderr: string | Buffer }>;
const TEXT_EXTENSIONS = new Set([
'.txt',
@@ -29,6 +32,9 @@ const TEXT_EXTENSIONS = new Set([
'.html',
'.htm',
]);
/**
 * Lowercase file extensions of binary formats whose text is extracted
 * before ingestion instead of being read directly. Currently only PDF.
 */
const EXTRACTABLE_BINARY_EXTENSIONS = new Set<string>(['.pdf']);
export interface MinioIngestDeps {
execRunner?: ExecRunner;
@@ -42,12 +48,40 @@ function isLikelyText(content: string): boolean {
/**
 * Decide from an object key's extension whether the object may be ingested.
 *
 * Keys without an extension are accepted. Otherwise the lowercased extension
 * must be a known text extension or an extractable binary extension.
 *
 * @param objectKey - Object key (path) inside the bucket.
 * @returns `true` when the extension is acceptable for ingestion.
 */
function isLikelyTextObject(objectKey: string): boolean {
  const ext = extname(objectKey).toLowerCase();
  if (!ext) {return true;}
  // Single combined check: a preceding `return TEXT_EXTENSIONS.has(ext);`
  // would make the extractable-binary half unreachable and reject `.pdf` keys.
  return TEXT_EXTENSIONS.has(ext) || EXTRACTABLE_BINARY_EXTENSIONS.has(ext);
}
/**
 * Report whether the object key's extension (compared case-insensitively)
 * belongs to the set of extractable binary formats.
 *
 * @param objectKey - Object key (path) inside the bucket.
 * @returns `true` when the extension is listed in `EXTRACTABLE_BINARY_EXTENSIONS`.
 */
function isExtractableBinaryObject(objectKey: string): boolean {
  const extension = extname(objectKey).toLowerCase();
  return EXTRACTABLE_BINARY_EXTENSIONS.has(extension);
}
/**
 * Fetch an object through the `mc` client and return its contents as text.
 *
 * Objects with an extractable binary extension are copied with `mc cp` into a
 * freshly created temporary directory and converted with `pdftotext`; the
 * directory is removed afterwards whether or not the commands succeeded.
 * Every other object is read directly with `mc cat`.
 *
 * @param runner - Command runner used to invoke `mc` and `pdftotext`.
 * @param remotePath - Remote path handed to `mc`.
 * @param objectKey - Object key; only its extension is inspected.
 * @param env - Environment passed to the `mc` invocations.
 * @returns The command's stdout, converted to a string via `toText`.
 */
async function readObjectText(
  runner: ExecRunner,
  remotePath: string,
  objectKey: string,
  env: NodeJS.ProcessEnv,
): Promise<string> {
  // Same output cap (20 MiB) for every command launched here.
  const maxBuffer = 20 * 1024 * 1024;

  if (isExtractableBinaryObject(objectKey)) {
    const scratchDir = mkdtempSync(join(tmpdir(), 'flynn-minio-ingest-'));
    try {
      const downloadTarget = join(scratchDir, 'object.bin');
      await runner('mc', ['cp', remotePath, downloadTarget], { env, maxBuffer });
      // `-` as the output file makes pdftotext write the text to stdout.
      const extracted = await runner('pdftotext', ['-q', downloadTarget, '-'], { maxBuffer });
      return toText(extracted.stdout);
    } finally {
      // Always discard the downloaded copy, even when a command failed.
      rmSync(scratchDir, { recursive: true, force: true });
    }
  }

  const fetched = await runner('mc', ['cat', remotePath], { env, maxBuffer });
  return toText(fetched.stdout);
}
/**
 * Internal helpers gathered into one exported object so that code outside
 * this module (the ingest tests and the MinIO sync tool) can call them via
 * `minioIngestInternals.<name>`.
 */
export const minioIngestInternals = {
isLikelyText,
isLikelyTextObject,
isExtractableBinaryObject,
readObjectText,
};
interface MinioIngestArgs {
@@ -139,10 +173,9 @@ export function createMinioIngestTool(config: BackupConfig, store: MemoryStore,
const remotePath = `${alias}/${bucket}/${objectKey}`;
try {
const { stdout } = await runner('mc', ['cat', remotePath], { env, maxBuffer: 20 * 1024 * 1024 });
const text = typeof stdout === 'string' ? stdout : stdout.toString('utf-8');
const text = await readObjectText(runner, remotePath, objectKey, env);
if (!force && !isLikelyText(text)) {
if (!force && !isExtractableBinaryObject(objectKey) && !isLikelyText(text)) {
return {
success: false,
output: '',
@@ -180,3 +213,6 @@ export function createMinioIngestTool(config: BackupConfig, store: MemoryStore,
},
};
}
/**
 * Normalize command output to a string.
 *
 * @param value - Output that is either already a string or a Buffer.
 * @returns The string unchanged, or the Buffer decoded as UTF-8.
 */
function toText(value: string | Buffer): string {
  if (typeof value !== 'string') {
    return value.toString('utf-8');
  }
  return value;
}
+1 -1
View File
@@ -104,7 +104,7 @@ describe('createMinioSyncTool', () => {
const store = { write } as unknown as MemoryStore;
const execRunner = vi.fn(async (_file: string, args: string[]) => {
if (args[0] === 'ls') {
return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.pdf"}', stderr: '' };
return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.docx"}', stderr: '' };
}
return { stdout: 'fake text', stderr: '' };
});
+3 -7
View File
@@ -12,7 +12,7 @@ type ExecRunner = (
file: string,
args: string[],
options?: { env?: NodeJS.ProcessEnv; maxBuffer?: number },
) => Promise<{ stdout: string; stderr: string }>;
) => Promise<{ stdout: string | Buffer; stderr: string | Buffer }>;
interface MinioSyncArgs {
prefix: string;
@@ -168,13 +168,9 @@ export function createMinioSyncTool(config: BackupConfig, store: MemoryStore, de
}
const remotePath = `${alias}/${bucket}/${key}`;
const { stdout: objectStdout } = await runner('mc', ['cat', remotePath], {
env,
maxBuffer: 20 * 1024 * 1024,
});
const text = typeof objectStdout === 'string' ? objectStdout : objectStdout.toString('utf-8');
const text = await minioIngestInternals.readObjectText(runner, remotePath, key, env);
if (!force && !minioIngestInternals.isLikelyText(text)) {
if (!force && !minioIngestInternals.isExtractableBinaryObject(key) && !minioIngestInternals.isLikelyText(text)) {
skipped++;
continue;
}