diff --git a/README.md b/README.md index 48254c3..99de387 100644 --- a/README.md +++ b/README.md @@ -627,8 +627,8 @@ backup: When `backup.minio.enabled` is configured, Flynn also exposes MinIO tools: - `minio.share`: upload a local file to the configured MinIO bucket and return a temporary download URL (`mc share download`) -- `minio.ingest`: read a text-like object from MinIO and append/replace a memory namespace (useful for syncing notes/runbooks into long-term memory) -- `minio.sync`: recursively ingest a MinIO prefix into nested memory namespaces with object and size limits +- `minio.ingest`: read a text-like object (and PDF via `pdftotext` when available) from MinIO and append/replace a memory namespace +- `minio.sync`: recursively ingest a MinIO prefix into nested memory namespaces with object and size limits (including PDF extraction when available) ## Kubernetes Tools diff --git a/docs/api/TOOLS.md b/docs/api/TOOLS.md index 45bd82c..ba639b6 100644 --- a/docs/api/TOOLS.md +++ b/docs/api/TOOLS.md @@ -968,7 +968,7 @@ Upload a local file to MinIO and return a temporary presigned download URL. #### `minio.ingest` -Read a text-like object from MinIO and write it into a memory namespace. +Read a text-like object from MinIO (and PDFs when `pdftotext` is available) and write it into a memory namespace. ```json { @@ -1010,7 +1010,7 @@ Read a text-like object from MinIO and write it into a memory namespace. #### `minio.sync` -Sync text-like objects from a MinIO prefix into nested memory namespaces. +Sync text-like objects from a MinIO prefix into nested memory namespaces (with PDF extraction when available). 
```json { diff --git a/docs/plans/state.json b/docs/plans/state.json index 489fb5a..c98647b 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -141,6 +141,22 @@ ], "test_status": "pnpm test:run src/tools/builtin/k8s.test.ts src/tools/policy.test.ts src/config/schema.test.ts + pnpm typecheck passing" }, + "minio-pdf-ingestion-support": { + "status": "completed", + "date": "2026-02-16", + "updated": "2026-02-16", + "summary": "Extended MinIO knowledge ingestion to support PDF documents via `pdftotext` extraction in both `minio.ingest` and `minio.sync` paths (when available), while preserving text-safety checks for other binary formats. Updated tests and docs accordingly.", + "files_modified": [ + "src/tools/builtin/minio-ingest.ts", + "src/tools/builtin/minio-sync.ts", + "src/tools/builtin/minio-ingest.test.ts", + "src/tools/builtin/minio-sync.test.ts", + "README.md", + "docs/api/TOOLS.md", + "docs/plans/state.json" + ], + "test_status": "pnpm test:run src/tools/builtin/minio-ingest.test.ts src/tools/builtin/minio-sync.test.ts + pnpm typecheck passing" + }, "backup-session-summary-audit-trail": { "status": "completed", "date": "2026-02-16", diff --git a/src/tools/builtin/minio-ingest.test.ts b/src/tools/builtin/minio-ingest.test.ts index 840a7d2..604bce4 100644 --- a/src/tools/builtin/minio-ingest.test.ts +++ b/src/tools/builtin/minio-ingest.test.ts @@ -31,10 +31,11 @@ describe('minio ingest internals', () => { it('accepts known text-like extensions', () => { expect(minioIngestInternals.isLikelyTextObject('notes/today.md')).toBe(true); expect(minioIngestInternals.isLikelyTextObject('logs/daemon.log')).toBe(true); + expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(true); }); it('rejects likely binary extensions', () => { - expect(minioIngestInternals.isLikelyTextObject('manual.pdf')).toBe(false); + expect(minioIngestInternals.isLikelyTextObject('manual.exe')).toBe(false); }); }); @@ -72,18 +73,54 @@ describe('createMinioIngestTool', () 
=> { ); }); - it('rejects likely binary object unless force=true', async () => { + it('rejects unsupported binary object unless force=true', async () => { const write = vi.fn(); const store = { write } as unknown as MemoryStore; const execRunner = vi.fn(); const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner }); - const result = await tool.execute({ object_key: 'knowledge/diagram.pdf' }); + const result = await tool.execute({ object_key: 'knowledge/diagram.docx' }); expect(result.success).toBe(false); expect(result.error).toContain('Unsupported object type'); expect(execRunner).not.toHaveBeenCalled(); }); + it('extracts PDF text with pdftotext', async () => { + const write = vi.fn(); + const store = { write } as unknown as MemoryStore; + const execRunner = vi.fn(async (_file: string, args: string[]) => { + if (args[0] === 'cp') { + return { stdout: '', stderr: '' }; + } + if (_file === 'pdftotext') { + return { stdout: 'Extracted PDF text', stderr: '' }; + } + return { stdout: '', stderr: '' }; + }); + const tool = createMinioIngestTool(makeBackupConfig(), store, { execRunner }); + + const result = await tool.execute({ + object_key: 'knowledge/diagram.pdf', + mode: 'replace', + }); + expect(result.success).toBe(true); + expect(write).toHaveBeenCalledWith( + 'global/knowledge', + expect.stringContaining('Extracted PDF text'), + 'replace', + ); + expect(execRunner).toHaveBeenCalledWith( + 'mc', + expect.arrayContaining(['cp', 'flynningest/flynn-knowledge/knowledge/diagram.pdf']), + expect.objectContaining({ env: expect.any(Object) }), + ); + expect(execRunner).toHaveBeenCalledWith( + 'pdftotext', + expect.arrayContaining(['-q']), + expect.any(Object), + ); + }); + it('allows non-text extension when force=true', async () => { const write = vi.fn(); const store = { write } as unknown as MemoryStore; diff --git a/src/tools/builtin/minio-ingest.ts b/src/tools/builtin/minio-ingest.ts index 19ec287..0a03013 100644 --- 
a/src/tools/builtin/minio-ingest.ts +++ b/src/tools/builtin/minio-ingest.ts @@ -1,6 +1,9 @@ import { promisify } from 'node:util'; import { execFile } from 'node:child_process'; import { extname } from 'node:path'; +import { mkdtempSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; import type { BackupConfig } from '../../config/schema.js'; import type { MemoryStore } from '../../memory/store.js'; import type { Tool, ToolResult } from '../types.js'; @@ -12,7 +15,7 @@ type ExecRunner = ( file: string, args: string[], options?: { env?: NodeJS.ProcessEnv; maxBuffer?: number }, -) => Promise<{ stdout: string; stderr: string }>; +) => Promise<{ stdout: string | Buffer; stderr: string | Buffer }>; const TEXT_EXTENSIONS = new Set([ '.txt', @@ -29,6 +32,9 @@ const TEXT_EXTENSIONS = new Set([ '.html', '.htm', ]); +const EXTRACTABLE_BINARY_EXTENSIONS = new Set([ + '.pdf', +]); export interface MinioIngestDeps { execRunner?: ExecRunner; @@ -42,12 +48,40 @@ function isLikelyText(content: string): boolean { function isLikelyTextObject(objectKey: string): boolean { const ext = extname(objectKey).toLowerCase(); if (!ext) {return true;} - return TEXT_EXTENSIONS.has(ext); + return TEXT_EXTENSIONS.has(ext) || EXTRACTABLE_BINARY_EXTENSIONS.has(ext); +} + +function isExtractableBinaryObject(objectKey: string): boolean { + return EXTRACTABLE_BINARY_EXTENSIONS.has(extname(objectKey).toLowerCase()); +} + +async function readObjectText( + runner: ExecRunner, + remotePath: string, + objectKey: string, + env: NodeJS.ProcessEnv, +): Promise<string> { + if (!isExtractableBinaryObject(objectKey)) { + const { stdout } = await runner('mc', ['cat', remotePath], { env, maxBuffer: 20 * 1024 * 1024 }); + return toText(stdout); + } + + const tempDir = mkdtempSync(join(tmpdir(), 'flynn-minio-ingest-')); + const localPath = join(tempDir, 'object.bin'); + try { + await runner('mc', ['cp', remotePath, localPath], { env, maxBuffer: 20 * 1024 * 1024 }); + const { 
stdout } = await runner('pdftotext', ['-q', localPath, '-'], { maxBuffer: 20 * 1024 * 1024 }); + return toText(stdout); + } finally { + rmSync(tempDir, { recursive: true, force: true }); + } } export const minioIngestInternals = { isLikelyText, isLikelyTextObject, + isExtractableBinaryObject, + readObjectText, }; interface MinioIngestArgs { @@ -139,10 +173,9 @@ export function createMinioIngestTool(config: BackupConfig, store: MemoryStore, const remotePath = `${alias}/${bucket}/${objectKey}`; try { - const { stdout } = await runner('mc', ['cat', remotePath], { env, maxBuffer: 20 * 1024 * 1024 }); - const text = typeof stdout === 'string' ? stdout : stdout.toString('utf-8'); + const text = await readObjectText(runner, remotePath, objectKey, env); - if (!force && !isLikelyText(text)) { + if (!force && !isExtractableBinaryObject(objectKey) && !isLikelyText(text)) { return { success: false, output: '', @@ -180,3 +213,6 @@ export function createMinioIngestTool(config: BackupConfig, store: MemoryStore, }, }; } +function toText(value: string | Buffer): string { + return typeof value === 'string' ? 
value : value.toString('utf-8'); +} diff --git a/src/tools/builtin/minio-sync.test.ts b/src/tools/builtin/minio-sync.test.ts index ae109dc..c8fca2b 100644 --- a/src/tools/builtin/minio-sync.test.ts +++ b/src/tools/builtin/minio-sync.test.ts @@ -104,7 +104,7 @@ describe('createMinioSyncTool', () => { const store = { write } as unknown as MemoryStore; const execRunner = vi.fn(async (_file: string, args: string[]) => { if (args[0] === 'ls') { - return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.pdf"}', stderr: '' }; + return { stdout: '{"status":"success","type":"file","key":"knowledge/diagram.docx"}', stderr: '' }; } return { stdout: 'fake text', stderr: '' }; }); diff --git a/src/tools/builtin/minio-sync.ts b/src/tools/builtin/minio-sync.ts index 5033a91..51c3a33 100644 --- a/src/tools/builtin/minio-sync.ts +++ b/src/tools/builtin/minio-sync.ts @@ -12,7 +12,7 @@ type ExecRunner = ( file: string, args: string[], options?: { env?: NodeJS.ProcessEnv; maxBuffer?: number }, -) => Promise<{ stdout: string; stderr: string }>; +) => Promise<{ stdout: string | Buffer; stderr: string | Buffer }>; interface MinioSyncArgs { prefix: string; @@ -168,13 +168,9 @@ export function createMinioSyncTool(config: BackupConfig, store: MemoryStore, de } const remotePath = `${alias}/${bucket}/${key}`; - const { stdout: objectStdout } = await runner('mc', ['cat', remotePath], { - env, - maxBuffer: 20 * 1024 * 1024, - }); - const text = typeof objectStdout === 'string' ? objectStdout : objectStdout.toString('utf-8'); + const text = await minioIngestInternals.readObjectText(runner, remotePath, key, env); - if (!force && !minioIngestInternals.isLikelyText(text)) { + if (!force && !minioIngestInternals.isExtractableBinaryObject(key) && !minioIngestInternals.isLikelyText(text)) { skipped++; continue; }