219 lines
6.9 KiB
TypeScript
219 lines
6.9 KiB
TypeScript
import { promisify } from 'node:util';
|
|
import { execFile } from 'node:child_process';
|
|
import { extname } from 'node:path';
|
|
import { mkdtempSync, rmSync } from 'node:fs';
|
|
import { join } from 'node:path';
|
|
import { tmpdir } from 'node:os';
|
|
import type { BackupConfig } from '../../config/schema.js';
|
|
import type { MemoryStore } from '../../memory/store.js';
|
|
import type { Tool, ToolResult } from '../types.js';
|
|
import { backupInternals } from '../../backup/index.js';
|
|
|
|
// Promise-returning form of child_process.execFile; used as the default command
// runner when no execRunner is injected through MinioIngestDeps.
const execFileAsync = promisify(execFile);
|
|
|
|
/**
 * Signature of the function used to run external commands (`mc`, `pdftotext`).
 *
 * It mirrors the promisified `execFile` call shape so tests can inject a fake
 * runner instead of spawning real processes.
 *
 * @param file - Executable to run.
 * @param args - Arguments passed to the executable (no shell interpolation).
 * @param options - Optional environment and stdout/stderr buffer limit.
 * @returns The captured stdout and stderr, as strings or Buffers.
 */
type ExecRunner = (
  file: string,
  args: string[],
  options?: { env?: NodeJS.ProcessEnv; maxBuffer?: number },
) => Promise<{ stdout: string | Buffer; stderr: string | Buffer }>;
|
|
|
|
/**
 * Lower-case file extensions (including the leading dot) that are treated as
 * plain text and can be ingested directly.
 *
 * Typed as a read-only set so the shared allow-list cannot be mutated by accident.
 */
const TEXT_EXTENSIONS: ReadonlySet<string> = new Set([
  '.txt',
  '.md',
  '.markdown',
  '.csv',
  '.tsv',
  '.json',
  '.jsonl',
  '.yaml',
  '.yml',
  '.log',
  '.xml',
  '.html',
  '.htm',
]);
|
|
/**
 * Lower-case extensions of binary formats that are not read as raw text but
 * converted to text by an external extractor before ingestion (currently PDF).
 */
const EXTRACTABLE_BINARY_EXTENSIONS: ReadonlySet<string> = new Set([
  '.pdf',
]);
|
|
|
|
/**
 * Optional dependencies that can be injected into the MinIO ingest tool,
 * primarily so tests can avoid spawning processes and control timestamps.
 */
export interface MinioIngestDeps {
  /** Replacement for the default `execFile`-based command runner. */
  execRunner?: ExecRunner;
  /** Clock used for the `imported_at` stamp; defaults to `new Date()`. */
  now?: () => Date;
}
|
|
|
|
/**
 * Heuristic content check: a string is considered text when it contains no
 * NUL (U+0000) character, which is the usual marker of binary data.
 *
 * @param content - Decoded object content.
 * @returns `true` when no NUL character is present (including for `''`).
 */
function isLikelyText(content: string): boolean {
  return !content.includes('\u0000');
}
|
|
|
|
/**
 * Decides from the object key alone whether the object is a candidate for
 * ingestion.
 *
 * @param objectKey - Object key inside the bucket.
 * @returns `true` when the key has no extension, a known text extension, or an
 *   extension of a binary format that can be converted to text.
 */
function isLikelyTextObject(objectKey: string): boolean {
  const ext = extname(objectKey).toLowerCase();
  // Keys without an extension are given the benefit of the doubt; the content
  // check after download still rejects binary data.
  if (ext === '') {
    return true;
  }
  return TEXT_EXTENSIONS.has(ext) || EXTRACTABLE_BINARY_EXTENSIONS.has(ext);
}
|
|
|
|
/**
 * Reports whether the object key names a binary format whose text must be
 * extracted with an external tool instead of being read as-is.
 *
 * @param objectKey - Object key inside the bucket.
 * @returns `true` when the key's extension (case-insensitive) is extractable.
 */
function isExtractableBinaryObject(objectKey: string): boolean {
  const ext = extname(objectKey).toLowerCase();
  return EXTRACTABLE_BINARY_EXTENSIONS.has(ext);
}
|
|
|
|
/**
 * Fetches an object through the `mc` CLI and returns its content as text.
 *
 * Text-like objects are streamed with `mc cat`. Extractable binaries are first
 * copied into a private temporary directory with `mc cp` and then converted
 * with `pdftotext`; the temporary directory is always removed afterwards.
 *
 * @param runner - Command runner used to invoke `mc` and `pdftotext`.
 * @param remotePath - Remote path passed to `mc` (`alias/bucket/key` as built by the caller).
 * @param objectKey - Object key, used only to decide which read strategy applies.
 * @param env - Environment passed to the `mc` invocations.
 * @returns The object's text content.
 * @throws Whatever the runner rejects with (e.g. missing binary, non-zero exit,
 *   output larger than the buffer limit).
 */
async function readObjectText(
  runner: ExecRunner,
  remotePath: string,
  objectKey: string,
  env: NodeJS.ProcessEnv,
): Promise<string> {
  // Upper bound for captured command output (20 MiB), shared by every invocation.
  const maxBuffer = 20 * 1024 * 1024;

  if (!isExtractableBinaryObject(objectKey)) {
    const { stdout } = await runner('mc', ['cat', remotePath], { env, maxBuffer });
    return toText(stdout);
  }

  const tempDir = mkdtempSync(join(tmpdir(), 'flynn-minio-ingest-'));
  const localPath = join(tempDir, 'object.bin');
  try {
    await runner('mc', ['cp', remotePath, localPath], { env, maxBuffer });
    // '-' makes pdftotext write the extracted text to stdout; '-q' silences diagnostics.
    // The mc environment is intentionally not passed to pdftotext.
    const { stdout } = await runner('pdftotext', ['-q', localPath, '-'], { maxBuffer });
    return toText(stdout);
  } finally {
    // Remove the downloaded copy even when the copy or extraction fails.
    rmSync(tempDir, { recursive: true, force: true });
  }
}
|
|
|
|
/**
 * Module-private helpers bundled into one exported object so they can be
 * reached from outside this module without being exported individually.
 */
export const minioIngestInternals = {
  isLikelyText,
  isLikelyTextObject,
  isExtractableBinaryObject,
  readObjectText,
};
|
|
|
|
/** Arguments accepted by the `minio.ingest` tool (see the tool's input schema). */
interface MinioIngestArgs {
  /** Object key inside the bucket, e.g. `knowledge/runbook.md`. */
  object_key: string;
  /** Bucket override; the configured bucket is used when omitted. */
  bucket?: string;
  /** Memory namespace to write to; defaults to `global/knowledge`. */
  namespace?: string;
  /** Write mode for the namespace; defaults to `append`. */
  mode?: 'append' | 'replace';
  /** Maximum number of characters to ingest; defaults to 20000. */
  max_chars?: number;
  /** Skip the extension and content checks that reject non-text objects. */
  force?: boolean;
}
|
|
|
|
/**
 * Creates the `minio.ingest` tool, which reads one object from MinIO via the
 * `mc` CLI and writes its text into a memory namespace.
 *
 * Validation failures and errors raised while reading or storing the object are
 * reported as `{ success: false, error }` rather than thrown.
 *
 * @param config - Backup configuration; its `minio` section supplies endpoint,
 *   credentials, default bucket and the enabled flag.
 * @param store - Memory store that receives the ingested payload.
 * @param deps - Optional overrides for the command runner and the clock.
 * @returns The tool definition (name, description, input schema, executor).
 */
export function createMinioIngestTool(config: BackupConfig, store: MemoryStore, deps?: MinioIngestDeps): Tool {
  // Character budget used when max_chars is omitted or not a usable number.
  const defaultMaxChars = 20_000;

  return {
    name: 'minio.ingest',
    description: 'Read a text-like object from MinIO and ingest it into memory namespace for later retrieval/search.',
    inputSchema: {
      type: 'object',
      properties: {
        object_key: {
          type: 'string',
          description: 'Object key in MinIO bucket (for example: "knowledge/runbook.md")',
        },
        bucket: {
          type: 'string',
          description: 'Optional bucket override. Defaults to backup.minio.bucket.',
        },
        namespace: {
          type: 'string',
          description: 'Memory namespace to write to. Default: "global/knowledge".',
        },
        mode: {
          type: 'string',
          enum: ['append', 'replace'],
          description: 'Write mode for memory namespace. Default: "append".',
        },
        max_chars: {
          type: 'number',
          description: 'Maximum characters to ingest. Default: 20000.',
        },
        force: {
          type: 'boolean',
          description: 'Ingest even if file extension/content look non-text.',
        },
      },
      required: ['object_key'],
    },
    execute: async (rawArgs: unknown): Promise<ToolResult> => {
      // Tool input is not guaranteed to match the schema: treat anything that
      // is not an object as "no arguments" instead of crashing on property access.
      const args: Partial<MinioIngestArgs> =
        typeof rawArgs === 'object' && rawArgs !== null ? (rawArgs as Partial<MinioIngestArgs>) : {};
      const minio = config.minio;
      // A non-string key is handled like a missing one rather than throwing on .trim().
      const objectKey = typeof args.object_key === 'string' ? args.object_key.trim() : '';
      const namespace = args.namespace ?? 'global/knowledge';
      const mode = args.mode ?? 'append';
      // Math.max(1, NaN) is NaN, and `length > NaN` is always false, which would
      // silently disable truncation. Fall back to the default in that case.
      const requestedMax = Math.floor(Number(args.max_chars ?? defaultMaxChars));
      const maxChars = Number.isNaN(requestedMax) ? defaultMaxChars : Math.max(1, requestedMax);
      const force = args.force ?? false;
      const bucket = args.bucket ?? minio.bucket;

      if (!objectKey) {
        return { success: false, output: '', error: 'object_key is required' };
      }
      if (!minio.enabled) {
        return { success: false, output: '', error: 'MinIO ingestion requires backup.minio.enabled=true' };
      }
      if (!minio.endpoint || !minio.access_key || !minio.secret_key || !bucket) {
        return {
          success: false,
          output: '',
          error: 'Missing MinIO credentials in backup.minio (endpoint/access_key/secret_key/bucket)',
        };
      }
      // Cheap pre-check on the key's extension before anything is downloaded.
      if (!force && !isLikelyTextObject(objectKey)) {
        return {
          success: false,
          output: '',
          error: `Unsupported object type for ingestion: ${objectKey}. Use force=true if you know it is text.`,
        };
      }

      // The connection is handed to `mc` through an MC_HOST_<alias> environment
      // variable for this invocation only.
      const alias = 'flynningest';
      const host = backupInternals.buildMinioHost({
        endpoint: minio.endpoint,
        accessKey: minio.access_key,
        secretKey: minio.secret_key,
        secure: minio.secure,
      });
      const env = { ...process.env, [`MC_HOST_${alias}`]: host };
      const runner = deps?.execRunner ?? (async (file: string, cmdArgs: string[], options?: { env?: NodeJS.ProcessEnv; maxBuffer?: number }) => {
        return execFileAsync(file, cmdArgs, options);
      });
      const remotePath = `${alias}/${bucket}/${objectKey}`;

      try {
        const text = await readObjectText(runner, remotePath, objectKey, env);

        // Extracted binaries (PDF) were already converted to text, so the NUL
        // heuristic only applies to objects that were read as-is.
        if (!force && !isExtractableBinaryObject(objectKey) && !isLikelyText(text)) {
          return {
            success: false,
            output: '',
            error: `Object appears binary and cannot be ingested safely: ${objectKey}. Use force=true to override.`,
          };
        }

        const trimmed = text.trim();
        if (!trimmed) {
          return {
            success: false,
            output: '',
            error: `Object is empty: minio://${bucket}/${objectKey}`,
          };
        }

        const clipped = trimmed.length > maxChars
          ? `${trimmed.slice(0, maxChars)}\n\n[truncated to ${maxChars} chars]`
          : trimmed;
        const importedAt = (deps?.now ? deps.now() : new Date()).toISOString();
        const payload = `## MinIO Import\nsource: minio://${bucket}/${objectKey}\nimported_at: ${importedAt}\n\n${clipped}`;
        store.write(namespace, payload, mode);

        return {
          success: true,
          output: `Ingested MinIO object into memory.\nSource: minio://${bucket}/${objectKey}\nNamespace: ${namespace}\nMode: ${mode}`,
        };
      } catch (error) {
        // Command failures (mc / pdftotext) and store errors surface here.
        return {
          success: false,
          output: '',
          error: error instanceof Error ? error.message : String(error),
        };
      }
    },
  };
}
|
|
/**
 * Normalises command output to a string.
 *
 * @param value - Output captured from a child process.
 * @returns `value` unchanged when it is already a string, otherwise the Buffer
 *   decoded as UTF-8.
 */
function toText(value: string | Buffer): string {
  if (typeof value === 'string') {
    return value;
  }
  return value.toString('utf-8');
}
|