From 289fc443806618db4e94c53c142e5ece1c46115c Mon Sep 17 00:00:00 2001 From: William Valentin Date: Mon, 16 Feb 2026 14:43:20 -0800 Subject: [PATCH] feat(cli): enforce minio ingestion extractor checks in setup and doctor --- README.md | 7 +++ docs/api/TOOLS.md | 5 ++ docs/plans/state.json | 19 +++++- src/cli/doctor.test.ts | 49 +++++++++++++++ src/cli/doctor.ts | 24 ++++++++ src/cli/minioExtractors.test.ts | 41 +++++++++++++ src/cli/minioExtractors.ts | 105 ++++++++++++++++++++++++++++++++ src/cli/setup.ts | 29 ++++++++- 8 files changed, 275 insertions(+), 4 deletions(-) create mode 100644 src/cli/minioExtractors.test.ts create mode 100644 src/cli/minioExtractors.ts diff --git a/README.md b/README.md index a722163..535fca9 100644 --- a/README.md +++ b/README.md @@ -630,6 +630,13 @@ When `backup.minio.enabled` is configured, Flynn also exposes MinIO tools: - `minio.ingest`: read a text-like object (plus PDF/DOCX via extractor tools when available) from MinIO and append/replace a memory namespace - `minio.sync`: recursively ingest a MinIO prefix into nested memory namespaces with object and size limits (including PDF/DOCX extraction when available) +PDF/DOCX ingestion runtime requirements: + +- PDF extraction requires `pdftotext`. +- DOCX extraction requires `pandoc` or `docx2txt`. +- `flynn setup` now checks these dependencies after config save when `backup.minio.enabled: true`. +- `flynn doctor` reports `MinIO ingest extractors` status so missing binaries are visible in health checks. + ## Kubernetes Tools Optional Kubernetes tools are available when `k8s.enabled: true`: diff --git a/docs/api/TOOLS.md b/docs/api/TOOLS.md index 771581c..fdb0241 100644 --- a/docs/api/TOOLS.md +++ b/docs/api/TOOLS.md @@ -962,6 +962,11 @@ Search memory using hybrid (keyword + vector) search. 
### MinIO Tools +Runtime extractor requirements for binary document ingestion: + +- PDF (`.pdf`): requires `pdftotext` +- DOCX (`.docx`): requires either `pandoc` or `docx2txt` + #### `minio.share` Upload a local file to MinIO and return a temporary presigned download URL. diff --git a/docs/plans/state.json b/docs/plans/state.json index a606647..2852d9c 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -172,6 +172,23 @@ ], "test_status": "pnpm test:run src/tools/builtin/minio-ingest.test.ts src/tools/builtin/minio-sync.test.ts + pnpm typecheck passing" }, + "minio-ingestion-extractor-requirements-setup-and-doctor": { + "status": "completed", + "date": "2026-02-16", + "updated": "2026-02-16", + "summary": "Documented MinIO ingestion runtime extractor requirements (PDF: `pdftotext`, DOCX: `pandoc` or `docx2txt`) and added shared dependency checks in setup + doctor flows. `flynn setup` now reports extractor readiness after config save when MinIO is enabled, and `flynn doctor` now includes a `MinIO ingest extractors` check.", + "files_modified": [ + "src/cli/minioExtractors.ts", + "src/cli/minioExtractors.test.ts", + "src/cli/setup.ts", + "src/cli/doctor.ts", + "src/cli/doctor.test.ts", + "README.md", + "docs/api/TOOLS.md", + "docs/plans/state.json" + ], + "test_status": "pnpm test:run src/cli/minioExtractors.test.ts src/cli/doctor.test.ts + pnpm typecheck passing" + }, "backup-session-summary-audit-trail": { "status": "completed", "date": "2026-02-16", @@ -3456,7 +3473,7 @@ } }, "overall_progress": { - "total_test_count": 1852, + "total_test_count": 1857, "all_tests_passing": true, "p0_completion": "3/3 (100%)", "p1_completion": "4/4 (100%)", diff --git a/src/cli/doctor.test.ts b/src/cli/doctor.test.ts index cdab2db..4cab918 100644 --- a/src/cli/doctor.test.ts +++ b/src/cli/doctor.test.ts @@ -335,6 +335,55 @@ models: expect(registryCheck?.detail).toContain('unconfigured'); }); + it('reports SKIP for MinIO ingest extractors when MinIO is disabled', 
async () => { + mkdirSync(testDir, { recursive: true }); + const configPath = join(testDir, 'minio-disabled.yaml'); + writeFileSync(configPath, ` +telegram: + bot_token: "test-token" + allowed_chat_ids: [123] +models: + default: + provider: anthropic + model: claude-sonnet +backup: + minio: + enabled: false +`); + + const ctx: DoctorContext = { configPath, dataDir: testDir }; + const results = await runChecks(ctx); + + const minioCheck = results.find(r => r.label.includes('MinIO ingest extractors')); + expect(minioCheck?.status).toBe('skip'); + }); + + it('reports MinIO ingest extractor status when MinIO is enabled', async () => { + mkdirSync(testDir, { recursive: true }); + const configPath = join(testDir, 'minio-enabled.yaml'); + writeFileSync(configPath, ` +telegram: + bot_token: "test-token" + allowed_chat_ids: [123] +models: + default: + provider: anthropic + model: claude-sonnet +backup: + minio: + enabled: true +`); + + const ctx: DoctorContext = { configPath, dataDir: testDir }; + const results = await runChecks(ctx); + + const minioCheck = results.find(r => r.label.includes('MinIO ingest extractors')); + expect(minioCheck).toBeDefined(); + expect(['pass', 'warn']).toContain(minioCheck?.status); + expect(minioCheck?.detail).toContain('pdf:'); + expect(minioCheck?.detail).toContain('docx:'); + }); + it('reports PASS for skills registry when source is parsable', async () => { mkdirSync(testDir, { recursive: true }); const registryPath = join(testDir, 'registry.json'); diff --git a/src/cli/doctor.ts b/src/cli/doctor.ts index f1aefda..967812d 100644 --- a/src/cli/doctor.ts +++ b/src/cli/doctor.ts @@ -6,6 +6,7 @@ import { homedir } from 'os'; import { resolve, join } from 'path'; import { parse } from 'yaml'; import { configSchema } from '../config/schema.js'; +import { checkMinioExtractorStatus, summarizeMinioExtractorStatus } from './minioExtractors.js'; export interface CheckResult { status: 'pass' | 'fail' | 'warn' | 'skip'; @@ -574,6 +575,28 @@ const 
checkGmail: Check = async (ctx) => { return { status: warnings.length > 0 ? 'warn' : 'pass', label: 'Gmail configured', detail: withWarnings }; }; +const checkMinioExtractors: Check = async (ctx) => { + if (!ctx.config) { + return { status: 'skip', label: 'MinIO ingest extractors', detail: '(config invalid)' }; + } + + const status = await checkMinioExtractorStatus(ctx.config as unknown as Record<string, unknown>); + if (!status.minioEnabled) { + return { status: 'skip', label: 'MinIO ingest extractors', detail: '(backup.minio not enabled)' }; + } + + const summary = summarizeMinioExtractorStatus(status); + if (status.missingRequirements.length > 0) { + return { + status: 'warn', + label: 'MinIO ingest extractors', + detail: `${summary} — install missing extractors for PDF/DOCX ingestion`, + }; + } + + return { status: 'pass', label: 'MinIO ingest extractors', detail: summary }; +}; + const allChecks: Check[] = [ checkConfigExists, checkOverlayExists, @@ -586,6 +609,7 @@ const allChecks: Check[] = [ checkModelConnectivity, checkTelegram, checkGmail, + checkMinioExtractors, checkMcpServers, checkSkills, checkSkillsRegistry, diff --git a/src/cli/minioExtractors.test.ts b/src/cli/minioExtractors.test.ts new file mode 100644 index 0000000..a5eae41 --- /dev/null +++ b/src/cli/minioExtractors.test.ts @@ -0,0 +1,41 @@ +import { describe, it, expect } from 'vitest'; +import { + checkMinioExtractorStatus, + renderMinioExtractorSetupLines, + summarizeMinioExtractorStatus, +} from './minioExtractors.js'; + +describe('minio extractor requirements', () => { + it('skips checks when backup.minio is not enabled', async () => { + const status = await checkMinioExtractorStatus({}); + + expect(status.minioEnabled).toBe(false); + expect(status.missingRequirements).toEqual([]); + expect(renderMinioExtractorSetupLines(status)).toEqual([]); + }); + + it('reports missing pdf/docx extractors', async () => { + const status = await checkMinioExtractorStatus( + { backup: { minio: { enabled: true } } }, + 
async () => false, + ); + + expect(status.minioEnabled).toBe(true); + expect(status.pdfSupported).toBe(false); + expect(status.docxSupported).toBe(false); + expect(status.missingRequirements).toEqual(['pdftotext', 'pandoc or docx2txt']); + expect(summarizeMinioExtractorStatus(status)).toBe('pdf:missing(pdftotext), docx:missing(pandoc|docx2txt)'); + }); + + it('accepts pandoc as docx extractor', async () => { + const status = await checkMinioExtractorStatus( + { backup: { minio: { enabled: true } } }, + async (command) => command === 'pdftotext' || command === 'pandoc', + ); + + expect(status.pdfSupported).toBe(true); + expect(status.docxSupported).toBe(true); + expect(status.availableDocxExtractors).toEqual(['pandoc']); + expect(status.missingRequirements).toEqual([]); + }); +}); diff --git a/src/cli/minioExtractors.ts b/src/cli/minioExtractors.ts new file mode 100644 index 0000000..4fbfed4 --- /dev/null +++ b/src/cli/minioExtractors.ts @@ -0,0 +1,105 @@ +import { execFile } from 'child_process'; +import { promisify } from 'util'; + +const execFileAsync = promisify(execFile); + +type UnknownRecord = Record<string, unknown>; +type CommandExistsFn = (command: string) => Promise<boolean>; + +const asRecord = (value: unknown): UnknownRecord | undefined => ( + value && typeof value === 'object' ? 
value as UnknownRecord : undefined +); + +async function commandExists(command: string): Promise<boolean> { + try { + await execFileAsync('sh', ['-lc', 'command -v "$1" >/dev/null 2>&1', 'sh', command]); /* name is passed as $1, never spliced into the script text */ + return true; + } catch { + return false; + } +} + +export interface MinioExtractorStatus { + minioEnabled: boolean; + pdfSupported: boolean; + docxSupported: boolean; + availableDocxExtractors: string[]; + missingRequirements: string[]; +} + +export async function checkMinioExtractorStatus( + config: Record<string, unknown>, + exists: CommandExistsFn = commandExists, +): Promise<MinioExtractorStatus> { + const backup = asRecord(config.backup); + const minio = asRecord(backup?.minio); + const minioEnabled = minio?.enabled === true; + + if (!minioEnabled) { + return { + minioEnabled: false, + pdfSupported: false, + docxSupported: false, + availableDocxExtractors: [], + missingRequirements: [], + }; + } + + const [hasPdfToText, hasPandoc, hasDocx2Txt] = await Promise.all([ + exists('pdftotext'), + exists('pandoc'), + exists('docx2txt'), + ]); + + const availableDocxExtractors = [ + ...(hasPandoc ? ['pandoc'] : []), + ...(hasDocx2Txt ? ['docx2txt'] : []), + ]; + const pdfSupported = hasPdfToText; + const docxSupported = availableDocxExtractors.length > 0; + + const missingRequirements: string[] = []; + if (!pdfSupported) { + missingRequirements.push('pdftotext'); + } + if (!docxSupported) { + missingRequirements.push('pandoc or docx2txt'); + } + + return { + minioEnabled, + pdfSupported, + docxSupported, + availableDocxExtractors, + missingRequirements, + }; +} + +export function summarizeMinioExtractorStatus(status: MinioExtractorStatus): string { + const pdf = status.pdfSupported ? 'pdf:ok(pdftotext)' : 'pdf:missing(pdftotext)'; + const docx = status.docxSupported + ? 
`docx:ok(${status.availableDocxExtractors.join('|')})` + : 'docx:missing(pandoc|docx2txt)'; + return `${pdf}, ${docx}`; +} + +export function renderMinioExtractorSetupLines(status: MinioExtractorStatus): string[] { + if (!status.minioEnabled) { + return []; + } + + const lines: string[] = [ + 'MinIO ingestion extractor requirements:', + ` PDF (.pdf): pdftotext ${status.pdfSupported ? 'detected' : 'missing'}`, + ` DOCX (.docx): pandoc or docx2txt ${status.docxSupported ? `detected (${status.availableDocxExtractors.join(', ')})` : 'missing'}`, + ]; + + if (status.missingRequirements.length > 0) { + lines.push(' Missing extractors will limit PDF/DOCX ingestion for minio.ingest and minio.sync.'); + lines.push(' Install missing tools, then run `flynn doctor` to verify.'); + } else { + lines.push(' All extractor dependencies detected.'); + } + + return lines; +} diff --git a/src/cli/setup.ts b/src/cli/setup.ts index 868439c..b893745 100644 --- a/src/cli/setup.ts +++ b/src/cli/setup.ts @@ -8,6 +8,7 @@ import { createPrompter } from './setup/prompts.js'; import { ConfigBuilder } from './setup/config.js'; import { runFirstRunWizard, runMenu } from './setup/orchestrator.js'; import { runGoogleAuth } from './setup/automation.js'; +import { checkMinioExtractorStatus, renderMinioExtractorSetupLines } from './minioExtractors.js'; export async function runSetup(configPath: string): Promise { const rl = createInterface({ input: process.stdin, output: process.stdout }); @@ -21,12 +22,16 @@ export async function runSetup(configPath: string): Promise { const builder = ConfigBuilder.fromObject(parsed); await runMenu(p, builder); saveConfig(configPath, builder, p); - await runGoogleAuth(p, builder.build()); + const config = builder.build(); + await printMinioExtractorSetupStatus(p, config as Record<string, unknown>); + await runGoogleAuth(p, config); } else { // No config → first-run wizard const builder = await runFirstRunWizard(p); saveConfig(configPath, builder, p); - await runGoogleAuth(p, 
builder.build()); + const config = builder.build(); + await printMinioExtractorSetupStatus(p, config as Record<string, unknown>); + await runGoogleAuth(p, config); const shouldStart = await p.confirm('Start Flynn now?', true); if (shouldStart) { @@ -46,7 +51,9 @@ export async function runSetup(configPath: string): Promise { const menuBuilder = ConfigBuilder.fromObject(parsed); await runMenu(p, menuBuilder); saveConfig(configPath, menuBuilder, p); - await runGoogleAuth(p, menuBuilder.build()); + const config = menuBuilder.build(); + await printMinioExtractorSetupStatus(p, config as Record<string, unknown>); + await runGoogleAuth(p, config); } } } finally { @@ -62,6 +69,22 @@ function saveConfig(configPath: string, builder: ConfigBuilder, p: { println(msg p.println(`✓ Config saved to ${configPath}`); } +async function printMinioExtractorSetupStatus( + p: { println(msg?: string): void }, + config: Record<string, unknown>, +): Promise<void> { + const status = await checkMinioExtractorStatus(config); + const lines = renderMinioExtractorSetupLines(status); + if (lines.length === 0) { + return; + } + + p.println(); + for (const line of lines) { + p.println(line); + } +} + export function registerSetupCommand(program: Command): void { program .command('setup')