// TypeScript source listing (941 lines, 28 KiB).
import { execFile as execFileCb } from 'node:child_process';
|
|
import { promisify } from 'node:util';
|
|
import { redactForAudit } from '../../audit/redact.js';
|
|
import type { Config } from '../../config/index.js';
|
|
import { listDockerDependencyStatuses } from './dockerDependencies.js';
|
|
import { listLocalBackendStatuses } from './localBackends.js';
|
|
|
|
/** Promise-returning form of `child_process.execFile`, used by the default command runner. */
const execFile = promisify(execFileCb);

/** Compose file handed to every `docker compose -f …` invocation in this module. */
const COMPOSE_FILE = 'docker-compose.yml';

/** systemd unit inspected for the Flynn daemon when the collector options do not name one. */
const DEFAULT_FLYNN_UNIT = 'flynn.service';

/** Series window (minutes) used when a query omits or supplies an invalid value. */
const DEFAULT_WINDOW_MINUTES = 60;

/** Upper bound applied to a requested series window (minutes). */
const MAX_WINDOW_MINUTES = 240;

/** Series bucket width (seconds) used when a query omits or supplies a disallowed value. */
const DEFAULT_BUCKET_SECONDS = 30;

/** The only bucket widths (seconds) a series query may select. */
const ALLOWED_BUCKET_SECONDS = [15, 30, 60] as const;

/** Number of log lines fetched when a log query omits or supplies an invalid value. */
const DEFAULT_LOG_LINES = 200;

/** Upper bound applied to a requested log line count. */
const MAX_LOG_LINES = 1000;

/** Log look-back (seconds) used when a log query omits or supplies an invalid value. */
const DEFAULT_LOG_SINCE_SECONDS = 900;

/** Upper bound applied to a requested log look-back (seconds). */
const MAX_LOG_SINCE_SECONDS = 86_400;

/** Interval between background samples (milliseconds) unless overridden by options. */
const DEFAULT_SAMPLE_INTERVAL_MS = 30_000;

/** Samples retained per source unless overridden by options. */
const DEFAULT_MAX_SAMPLES = 720;

/** Timeout (milliseconds) for status commands and the default runner. */
const DEFAULT_TIMEOUT_MS = 10_000;

/** Timeout (milliseconds) for log-fetching commands. */
const LARGE_TIMEOUT_MS = 15_000;

/** Output buffer limit (bytes) for commands that may print a lot (4 MiB). */
const LARGE_BUFFER_BYTES = 4 * 1024 * 1024;

// Numeric state codes recorded in series points.
const STATE_UNAVAILABLE = 0;
const STATE_STOPPED = 1;
const STATE_DEGRADED = 2;
const STATE_RUNNING = 3;

// Numeric health codes recorded in series points.
const HEALTH_UNKNOWN = 0;
const HEALTH_DEGRADED = 1;
const HEALTH_HEALTHY = 2;
|
|
|
|
/** What kind of thing a source is: a docker dependency or a systemd unit (user or system scope). */
export type ObservabilitySourceKind = 'docker_dependency' | 'systemd_user' | 'systemd_system';

/** Which runtime is queried for a source's logs. */
export type ObservabilityRuntime = 'docker_compose' | 'systemd_user' | 'systemd_system';

/** Coarse status reported for a source. */
export type ObservabilitySourceStatus = 'running' | 'degraded' | 'stopped' | 'unavailable' | 'unknown';
|
|
|
|
/** Descriptor of one monitored service as exposed to consumers of the collector. */
export interface ObservabilitySource {
  /** Stable identifier; the collector builds these as `systemd:…`, `systemd-user:…` or `docker:…`. */
  id: string;
  /** Human-readable name; also used as the sort key for sources and series. */
  name: string;
  kind: ObservabilitySourceKind;
  runtime: ObservabilityRuntime;
  status: ObservabilitySourceStatus;
  /** Whether series data is offered for this source. */
  graphCapable: boolean;
  /** Whether logs can be requested for this source. */
  logCapable: boolean;
  /** Runtime-specific details; which fields are set depends on the runtime. */
  metadata?: {
    /** systemd unit name (systemd sources). */
    unit?: string;
    /** docker compose service name (docker sources). */
    service?: string;
    state?: string;
    health?: string;
    statusText?: string;
    containerName?: string | null;
  };
}
|
|
|
|
/** One bucketed data point of a source's series. */
export interface ObservabilitySeriesPoint {
  /** Bucket start time in epoch milliseconds. */
  ts: number;
  /** One of the numeric `STATE_*` codes. */
  stateCode: number;
  /** One of the numeric `HEALTH_*` codes. */
  healthCode: number;
  /** Cumulative error count as of this point. */
  errorCount: number;
  /** Cumulative restart count as of this point. */
  restartCount: number;
}

/** All series points of a single source. */
export interface ObservabilitySeriesEntry {
  sourceId: string;
  points: ObservabilitySeriesPoint[];
}

/** Result of a series query. */
export interface ObservabilitySeriesSnapshot {
  /** Epoch milliseconds at which the snapshot was produced. */
  generatedAt: number;
  /** Effective (sanitized) window that was applied. */
  windowMinutes: number;
  /** Effective (sanitized) bucket width that was applied. */
  bucketSeconds: number;
  series: ObservabilitySeriesEntry[];
}
|
|
|
|
/** One parsed log line. */
export interface ServiceLogEntry {
  /** Epoch milliseconds parsed from the line, when a timestamp was recognised. */
  ts?: number;
  /** Severity guessed from keywords in the text. */
  level?: 'info' | 'warn' | 'error';
  /** Line text after redaction. */
  text: string;
}

/** Result of a log query for one source. */
export interface ServiceLogSnapshot {
  sourceId: string;
  /** Epoch milliseconds at which the logs were fetched. */
  fetchedAt: number;
  /** True when at least one line had content redacted. */
  redacted: boolean;
  lines: ServiceLogEntry[];
  /** True when the number of returned lines reached the requested limit. */
  truncated: boolean;
}
|
|
|
|
/** Parameters of a log query. */
export interface ServiceLogQuery {
  /** Identifier of the source whose logs are requested (required). */
  sourceId: string;
  /** Maximum number of lines; defaulted and clamped by the collector. */
  lines?: number;
  /** Look-back in seconds; defaulted and clamped by the collector. */
  sinceSeconds?: number;
}

/** Parameters of a series query. */
export interface ObservabilitySeriesQuery {
  /** Window in minutes; defaulted and clamped by the collector. */
  windowMinutes?: number;
  /** Bucket width in seconds; values outside the allowed set fall back to the default. */
  bucketSeconds?: number;
  /** Restrict the result to these source ids; omitted or empty means all sources. */
  sourceIds?: string[];
}
|
|
|
|
/** Captured output of a finished command. */
type ExecResult = { stdout: string; stderr: string };

/**
 * Abstraction over command execution so the collector can be driven by a fake runner.
 * Resolves with the captured output; the default implementation rejects on failure.
 */
type CommandRunner = (
  command: string,
  args: string[],
  opts?: { timeoutMs?: number; maxBufferBytes?: number },
) => Promise<ExecResult>;
|
|
|
|
/** Result of observing one source during a single sampling pass. */
interface SourceSnapshot {
  source: ObservabilitySource;
  stateCode: number;
  healthCode: number;
  /** Whether this observation counts towards the source's error counter. */
  hasError: boolean;
  /** Identity of the running instance (e.g. `pid:…` or `container:…`); null when unknown. */
  fingerprint: string | null;
}

/** Running counters kept per source across sampling passes. */
interface SourceCounter {
  errorCount: number;
  restartCount: number;
  /** State code seen in the previous pass. */
  lastStateCode: number;
  /** Fingerprint seen in the previous pass. */
  lastFingerprint: string | null;
  /** False until the first sample has been recorded, so the first pass never counts as a restart. */
  hasPrevious: boolean;
}

/** One stored history sample of a source. */
interface SampleRecord {
  /** Sample time in epoch milliseconds. */
  ts: number;
  stateCode: number;
  healthCode: number;
  errorCount: number;
  restartCount: number;
}
|
|
|
|
/**
 * Heuristic: reports whether this process appears to have been started by systemd.
 *
 * @returns true when any of `INVOCATION_ID`, `JOURNAL_STREAM` or `SYSTEMD_EXEC_PID`
 *          is set to a non-empty value in the environment.
 */
function isGatewayRunningUnderSystemd(): boolean {
  const { INVOCATION_ID, JOURNAL_STREAM, SYSTEMD_EXEC_PID } = process.env;
  return Boolean(INVOCATION_ID || JOURNAL_STREAM || SYSTEMD_EXEC_PID);
}
|
|
|
|
/**
 * Default {@link CommandRunner}: runs the command with `execFile` (no shell).
 *
 * @param command Executable to run.
 * @param args Argument vector passed as-is.
 * @param opts Optional timeout (defaults to `DEFAULT_TIMEOUT_MS`) and output buffer
 *             limit (defaults to `LARGE_BUFFER_BYTES`).
 * @returns The captured stdout and stderr.
 * @throws Whatever the promisified `execFile` rejects with.
 */
async function defaultRunner(
  command: string,
  args: string[],
  opts?: { timeoutMs?: number; maxBufferBytes?: number },
): Promise<ExecResult> {
  const { stdout, stderr } = await execFile(command, args, {
    timeout: opts?.timeoutMs ?? DEFAULT_TIMEOUT_MS,
    maxBuffer: opts?.maxBufferBytes ?? LARGE_BUFFER_BYTES,
  });
  // Copy the two fields instead of asserting the promise type, so the compiler checks the shape.
  return { stdout, stderr };
}
|
|
|
|
/**
 * Reduces an arbitrary thrown value to a single human-readable line.
 *
 * For object values the first non-blank string among `stderr`, `stdout` and `message`
 * (in that order) is returned, trimmed. Fields that are not strings are ignored rather
 * than trimmed blindly, so an unusual error shape can never make this helper throw
 * while it is being used inside a `catch` block.
 *
 * @param error The caught value.
 * @returns The chosen text, or `String(error)` when nothing usable is found.
 */
function normalizeError(error: unknown): string {
  if (error && typeof error === 'object') {
    const candidate = error as { stderr?: unknown; stdout?: unknown; message?: unknown };
    for (const field of [candidate.stderr, candidate.stdout, candidate.message]) {
      if (typeof field !== 'string') {
        continue;
      }
      const trimmed = field.trim();
      if (trimmed.length > 0) {
        return trimmed;
      }
    }
  }
  return String(error);
}
|
|
|
|
/**
 * Parses `KEY=value` lines (the format this module requests from `systemctl show`).
 *
 * Lines without `=` or starting with `=` are skipped. Only the first `=` splits the line,
 * so values may contain `=`. Keys and values are trimmed; a repeated key keeps the last value.
 *
 * @param output Raw multi-line command output.
 * @returns Map from key to value.
 */
function parseKeyValueOutput(output: string): Record<string, string> {
  const result: Record<string, string> = {};
  for (const line of output.split('\n')) {
    const separator = line.indexOf('=');
    // Also covers blank lines: they contain no '=' at all.
    if (separator <= 0) {
      continue;
    }
    result[line.slice(0, separator).trim()] = line.slice(separator + 1).trim();
  }
  return result;
}
|
|
|
|
/**
 * Converts a textual number into a positive integer.
 *
 * The input is converted with `Number()` and floored, so fractional input is
 * accepted and rounded down.
 *
 * @param input Text to convert; may be undefined or empty.
 * @returns The floored value, or null when the input is missing, not finite, or not positive.
 */
function parseInteger(input: string | undefined): number | null {
  if (!input) {
    return null;
  }
  const value = Number(input);
  if (!Number.isFinite(value) || value <= 0) {
    return null;
  }
  return Math.floor(value);
}
|
|
|
|
/**
 * Maps a systemd `ActiveState` to the collector's status, state code and health code.
 *
 * @param activeState Raw state text; compared case-insensitively after trimming.
 * @param error When non-empty, the unit is reported as unavailable regardless of state.
 * @returns `running` for `active`; `stopped` for `inactive`/`dead`; `degraded` for
 *          `failed` and the transitional states; `unavailable` for `not-found`;
 *          otherwise `unknown` (recorded with the degraded state code).
 */
function mapSystemdStatus(activeState: string, error?: string): {
  status: ObservabilitySourceStatus;
  stateCode: number;
  healthCode: number;
} {
  if (error) {
    return { status: 'unavailable', stateCode: STATE_UNAVAILABLE, healthCode: HEALTH_UNKNOWN };
  }

  switch (activeState.trim().toLowerCase()) {
    case 'active':
      return { status: 'running', stateCode: STATE_RUNNING, healthCode: HEALTH_HEALTHY };
    case 'inactive':
    case 'dead':
      return { status: 'stopped', stateCode: STATE_STOPPED, healthCode: HEALTH_UNKNOWN };
    case 'failed':
    case 'activating':
    case 'deactivating':
    case 'reloading':
      return { status: 'degraded', stateCode: STATE_DEGRADED, healthCode: HEALTH_DEGRADED };
    case 'not-found':
      return { status: 'unavailable', stateCode: STATE_UNAVAILABLE, healthCode: HEALTH_UNKNOWN };
    default:
      return { status: 'unknown', stateCode: STATE_DEGRADED, healthCode: HEALTH_UNKNOWN };
  }
}
|
|
|
|
/**
 * Maps a docker container state and health to the collector's status and codes.
 *
 * Both inputs are compared case-insensitively after trimming.
 *
 * - `running` with health `healthy` is running/healthy; with no health information
 *   (empty string, `none`, `unknown`) it is running with unknown health; any other
 *   health value is degraded.
 * - `stopped`, `exited`, `not-created`, `created` are stopped. `exited` is the state
 *   Docker itself reports for a stopped container, so raw values are accepted as well.
 * - `unavailable`, `not-found` are unavailable.
 * - `restarting`, `paused` are degraded.
 * - Anything else is `unknown` (recorded with the degraded state code).
 *
 * @param stateRaw Container state text.
 * @param healthRaw Container health text.
 * @param error When non-empty, the dependency is reported as unavailable.
 */
function mapDockerStatus(stateRaw: string, healthRaw: string, error?: string): {
  status: ObservabilitySourceStatus;
  stateCode: number;
  healthCode: number;
} {
  if (error) {
    return { status: 'unavailable', stateCode: STATE_UNAVAILABLE, healthCode: HEALTH_UNKNOWN };
  }

  const state = stateRaw.trim().toLowerCase();
  const health = healthRaw.trim().toLowerCase();

  switch (state) {
    case 'running': {
      if (health === 'healthy') {
        return { status: 'running', stateCode: STATE_RUNNING, healthCode: HEALTH_HEALTHY };
      }
      // An empty health string means the container defines no healthcheck.
      if (health === '' || health === 'none' || health === 'unknown') {
        return { status: 'running', stateCode: STATE_RUNNING, healthCode: HEALTH_UNKNOWN };
      }
      return { status: 'degraded', stateCode: STATE_DEGRADED, healthCode: HEALTH_DEGRADED };
    }
    case 'stopped':
    case 'exited':
    case 'not-created':
    case 'created':
      return { status: 'stopped', stateCode: STATE_STOPPED, healthCode: HEALTH_UNKNOWN };
    case 'unavailable':
    case 'not-found':
      return { status: 'unavailable', stateCode: STATE_UNAVAILABLE, healthCode: HEALTH_UNKNOWN };
    case 'restarting':
    case 'paused':
      return { status: 'degraded', stateCode: STATE_DEGRADED, healthCode: HEALTH_DEGRADED };
    default:
      return { status: 'unknown', stateCode: STATE_DEGRADED, healthCode: HEALTH_UNKNOWN };
  }
}
|
|
|
|
/**
 * Guesses a severity from keywords found anywhere in a log line (case-insensitive).
 *
 * Error keywords take precedence over warning keywords.
 *
 * @param text Log line text.
 * @returns `error` for error/failed/fatal/panic, `warn` for warn/degraded, else `info`.
 */
function classifyLogLevel(text: string): 'info' | 'warn' | 'error' {
  const lower = text.toLowerCase();
  if (['error', 'failed', 'fatal', 'panic'].some((keyword) => lower.includes(keyword))) {
    return 'error';
  }
  // 'warn' also matches 'warning', so no separate check is needed.
  if (['warn', 'degraded'].some((keyword) => lower.includes(keyword))) {
    return 'warn';
  }
  return 'info';
}
|
|
|
|
/**
 * Inserts the colon into a trailing compact UTC offset (`+0100` becomes `+01:00`).
 *
 * @param ts Timestamp text.
 * @returns The text with its trailing sign-plus-four-digits offset rewritten, or unchanged
 *          when it does not end that way.
 */
function normalizeOffset(ts: string): string {
  return ts.replace(/([+-]\d{2})(\d{2})$/, '$1:$2');
}
|
|
|
|
/**
 * Parses a timestamp string into epoch milliseconds, trying progressively normalised forms.
 *
 * Tried in order: the raw text; the raw text with a colon inserted into a compact
 * offset; the text with its first space replaced by `T`; and that form with the offset
 * normalised as well. The first form `Date.parse` accepts wins.
 *
 * @param raw Timestamp text; may be undefined or empty.
 * @returns Epoch milliseconds, or undefined when no form parses.
 */
function parseTimestamp(raw: string | undefined): number | undefined {
  if (!raw) {
    return undefined;
  }
  // String#replace with a string pattern swaps only the first space and is a no-op without one.
  const isoLike = raw.replace(' ', 'T');
  const candidates = [raw, normalizeOffset(raw), isoLike, normalizeOffset(isoLike)];

  for (const candidate of candidates) {
    const parsed = Date.parse(candidate);
    if (!Number.isNaN(parsed)) {
      return parsed;
    }
  }
  return undefined;
}
|
|
|
|
/**
 * Normalises a requested series window.
 *
 * @param value Requested window in minutes.
 * @returns The floored value capped at `MAX_WINDOW_MINUTES`, or `DEFAULT_WINDOW_MINUTES`
 *          when the input is missing, not finite, or not positive.
 */
function sanitizeWindowMinutes(value: number | undefined): number {
  const minutes = Math.floor(Number(value ?? DEFAULT_WINDOW_MINUTES));
  if (!Number.isFinite(minutes) || minutes <= 0) {
    return DEFAULT_WINDOW_MINUTES;
  }
  return Math.min(MAX_WINDOW_MINUTES, minutes);
}
|
|
|
|
/**
 * Normalises a requested bucket width.
 *
 * @param value Requested bucket width in seconds.
 * @returns The floored value when it is one of `ALLOWED_BUCKET_SECONDS`,
 *          otherwise `DEFAULT_BUCKET_SECONDS`.
 */
function sanitizeBucketSeconds(value: number | undefined): number {
  const seconds = Math.floor(Number(value ?? DEFAULT_BUCKET_SECONDS));
  const allowed: readonly number[] = ALLOWED_BUCKET_SECONDS;
  return allowed.includes(seconds) ? seconds : DEFAULT_BUCKET_SECONDS;
}
|
|
|
|
/**
 * Normalises a requested log line count.
 *
 * @param value Requested number of lines.
 * @returns The floored value capped at `MAX_LOG_LINES`, or `DEFAULT_LOG_LINES`
 *          when the input is missing, not finite, or not positive.
 */
function sanitizeLogLines(value: number | undefined): number {
  const count = Math.floor(Number(value ?? DEFAULT_LOG_LINES));
  if (!Number.isFinite(count) || count <= 0) {
    return DEFAULT_LOG_LINES;
  }
  return Math.min(MAX_LOG_LINES, count);
}
|
|
|
|
/**
 * Normalises a requested log look-back.
 *
 * @param value Requested look-back in seconds.
 * @returns The floored value capped at `MAX_LOG_SINCE_SECONDS`, or
 *          `DEFAULT_LOG_SINCE_SECONDS` when the input is missing, not finite, or not positive.
 */
function sanitizeSinceSeconds(value: number | undefined): number {
  const seconds = Math.floor(Number(value ?? DEFAULT_LOG_SINCE_SECONDS));
  if (!Number.isFinite(seconds) || seconds <= 0) {
    return DEFAULT_LOG_SINCE_SECONDS;
  }
  return Math.min(MAX_LOG_SINCE_SECONDS, seconds);
}
|
|
|
|
/**
 * Canonicalises a caller-supplied source id by stripping surrounding whitespace.
 *
 * @param value Raw source id.
 * @returns The trimmed id (possibly empty).
 */
function normalizeSourceId(value: string): string {
  return value.trim();
}
|
|
|
|
/**
 * Runs a log line through `redactForAudit` and reports whether anything was redacted.
 *
 * @param text Raw log text.
 * @returns The redacted text (stringified if the redactor returned a non-string value)
 *          and a flag that is true when the redactor reported at least one redaction.
 */
function redactLogText(text: string): { text: string; redacted: boolean } {
  const { value, redactions } = redactForAudit(text);
  return {
    text: typeof value === 'string' ? value : String(value),
    redacted: redactions > 0,
  };
}
|
|
|
|
/**
 * Parses one line of `docker compose logs --timestamps` output.
 *
 * Steps: strip an optional `<name> | ` prefix, peel off a leading ISO timestamp
 * if present, redact the remaining text, then classify its level.
 *
 * @param line Raw log line.
 * @returns Parsed timestamp (if any), level, redacted text, and whether redaction occurred.
 */
function splitDockerLogLine(line: string): { ts?: number; level: 'info' | 'warn' | 'error'; text: string; redacted: boolean } {
  let content = line.trim();

  // Compose prefixes each line with the container name and a pipe.
  const prefixed = content.match(/^([A-Za-z0-9_.-]+)\s+\|\s+(.*)$/);
  if (prefixed) {
    content = prefixed[2]?.trim() ?? content;
  }

  let ts: number | undefined;
  const timestamped = content.match(/^(\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d(?:\.\d+)?(?:Z|[+-]\d\d:?\d\d))\s+(.*)$/);
  if (timestamped) {
    ts = parseTimestamp(timestamped[1]);
    content = timestamped[2] ?? content;
  }

  const redaction = redactLogText(content);
  return {
    ts,
    // Classify on the redacted text so severity never depends on hidden content.
    level: classifyLogLevel(redaction.text),
    text: redaction.text,
    redacted: redaction.redacted,
  };
}
|
|
|
|
/**
 * Parses one line of `journalctl --output short-iso-precise` output.
 *
 * A leading ISO-8601 timestamp (`T` or space separator, optional fraction, optional
 * `Z` or numeric offset) is peeled off when it parses. The remaining text is redacted
 * and classified. The timestamp pattern is anchored to the ISO shape on purpose: a
 * loose character class would also swallow a following all-numeric token such as a
 * host named `10-0-0-1`, making the timestamp unparseable.
 *
 * @param line Raw journal line.
 * @returns Parsed timestamp (if any), level, redacted text, and whether redaction occurred.
 */
function splitJournalLine(line: string): { ts?: number; level: 'info' | 'warn' | 'error'; text: string; redacted: boolean } {
  let content = line.trim();
  let ts: number | undefined;

  const timestamped = content.match(
    /^(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?)\s+(.*)$/,
  );
  if (timestamped) {
    const parsedTs = parseTimestamp(timestamped[1]?.trim());
    // Only strip the prefix when it really was a timestamp; otherwise keep the line intact.
    if (parsedTs !== undefined) {
      ts = parsedTs;
      content = timestamped[2]?.trim() ?? content;
    }
  }

  const redaction = redactLogText(content);
  return {
    ts,
    level: classifyLogLevel(redaction.text),
    text: redaction.text,
    redacted: redaction.redacted,
  };
}
|
|
|
|
/** Status of one systemd unit as read from `systemctl show`. */
interface SystemdStatus {
  /** Unit name that was queried. */
  unit: string;
  /** The unit's `Description`, or the caller-supplied display name when absent. */
  name: string;
  loadState: string;
  activeState: string;
  subState: string;
  /** `activeState`, or `activeState/subState` when the two differ; `unavailable` on failure. */
  statusText: string;
  /** Main process id, or null when none is reported. */
  pid: number | null;
  result: string;
  /** Present only when the status command failed. */
  error?: string;
}
|
|
|
|
/**
 * Queries `systemctl show` for one unit and converts the output into a {@link SystemdStatus}.
 *
 * Never rejects: a failing command yields a status with `error` set, `statusText`
 * of `unavailable`, and `activeState` of `not-found` when the error text mentions
 * "not found" (otherwise `unknown`).
 *
 * @param runner Command runner used to invoke `systemctl`.
 * @param opts Unit to query, fallback display name, and whether to pass `--user`.
 */
async function fetchSystemdUnitStatus(
  runner: CommandRunner,
  opts: { unit: string; name: string; user: boolean },
): Promise<SystemdStatus> {
  const args = [
    ...(opts.user ? ['--user'] : []),
    'show',
    opts.unit,
    '--property=LoadState,ActiveState,SubState,Description,ExecMainPID,Result',
    '--no-pager',
  ];

  try {
    const response = await runner('systemctl', args, {
      timeoutMs: DEFAULT_TIMEOUT_MS,
      maxBufferBytes: 1024 * 1024,
    });
    const properties = parseKeyValueOutput(response.stdout);
    // `||` rather than `??`: an empty property value must also fall back.
    const activeState = properties.ActiveState || 'unknown';
    const subState = properties.SubState || 'unknown';
    return {
      unit: opts.unit,
      name: properties.Description || opts.name,
      loadState: properties.LoadState || 'unknown',
      activeState,
      subState,
      statusText: activeState === subState ? activeState : `${activeState}/${subState}`,
      pid: parseInteger(properties.ExecMainPID),
      result: properties.Result || 'unknown',
    };
  } catch (error) {
    const detail = normalizeError(error);
    return {
      unit: opts.unit,
      name: opts.name,
      loadState: 'unknown',
      activeState: detail.toLowerCase().includes('not found') ? 'not-found' : 'unknown',
      subState: 'unknown',
      statusText: 'unavailable',
      pid: null,
      result: 'unknown',
      error: detail,
    };
  }
}
|
|
|
|
/**
 * Builds `--profile <name>` arguments for every profile declared in the compose file.
 *
 * Runs `docker compose -f <file> config --profiles`, de-duplicates the printed names
 * while keeping their order, and expands each into a `--profile` pair.
 *
 * @param runner Command runner used to invoke `docker`.
 * @returns The argument list; empty when no profiles exist or the command fails
 *          (profile discovery is best-effort).
 */
async function discoverComposeProfileArgs(runner: CommandRunner): Promise<string[]> {
  try {
    const { stdout } = await runner('docker', ['compose', '-f', COMPOSE_FILE, 'config', '--profiles'], {
      timeoutMs: DEFAULT_TIMEOUT_MS,
      maxBufferBytes: LARGE_BUFFER_BYTES,
    });
    const profiles = new Set(
      stdout
        .split('\n')
        .map((line) => line.trim())
        .filter((line) => line.length > 0),
    );
    return [...profiles].flatMap((profile) => ['--profile', profile]);
  } catch {
    return [];
  }
}
|
|
|
|
/** Construction options for {@link ObservabilityCollector}. */
export interface ObservabilityCollectorOptions {
  /** Application configuration forwarded to the backend and dependency status listers. */
  config: Config;
  /** systemd unit of the Flynn daemon; defaults to `flynn.service`. */
  flynnSystemdUnit?: string;
  /** Interval between background samples in milliseconds; the collector enforces a 5 s minimum. */
  samplingIntervalMs?: number;
  /** Samples retained per source; the collector enforces a minimum of 60. */
  maxSamplesPerSource?: number;
  /** Clock override (epoch milliseconds); defaults to `Date.now`. */
  now?: () => number;
  /** Command runner override; defaults to running real processes via `execFile`. */
  runner?: CommandRunner;
}
|
|
|
|
/**
 * Collects bounded service status samples and log snapshots for the web dashboard.
 *
 * Each sampling pass observes the Flynn daemon's systemd unit, the configured local
 * backends and the docker dependencies. Every source gets an entry in `sourceMeta`,
 * a bounded history in `sourceHistory`, and cumulative counters in `sourceCounters`.
 * Sources seen in an earlier pass but missing from the current one are kept and
 * recorded as unavailable.
 */
export class ObservabilityCollector {
  private readonly config: Config;
  private readonly flynnSystemdUnit: string;
  private readonly samplingIntervalMs: number;
  private readonly maxSamplesPerSource: number;
  private readonly now: () => number;
  private readonly runner: CommandRunner;

  /** Background sampling timer; null while stopped. */
  private sampleTimer: NodeJS.Timeout | null = null;
  /** The sampling pass currently running, shared by concurrent callers. */
  private inFlightSample: Promise<void> | null = null;
  private readonly sourceHistory = new Map<string, SampleRecord[]>();
  private readonly sourceMeta = new Map<string, ObservabilitySource>();
  private readonly sourceCounters = new Map<string, SourceCounter>();

  /**
   * @param options See {@link ObservabilityCollectorOptions}. The sampling interval is
   *   clamped to 5 s … 2^31-1 ms and the history size to at least 60; non-finite
   *   values fall back to the defaults.
   */
  constructor(options: ObservabilityCollectorOptions) {
    this.config = options.config;
    this.flynnSystemdUnit = options.flynnSystemdUnit ?? DEFAULT_FLYNN_UNIT;

    // Node treats a NaN or out-of-range timer delay as 1 ms, which would turn the
    // sampler into a busy loop. Fall back to the default for non-finite input and
    // cap the delay at the largest value timers accept.
    const requestedInterval = Math.floor(options.samplingIntervalMs ?? DEFAULT_SAMPLE_INTERVAL_MS);
    const intervalMs = Number.isFinite(requestedInterval) ? requestedInterval : DEFAULT_SAMPLE_INTERVAL_MS;
    this.samplingIntervalMs = Math.min(2_147_483_647, Math.max(5_000, intervalMs));

    // A NaN limit would make `length > limit` always false and let history grow forever.
    const requestedMax = Math.floor(options.maxSamplesPerSource ?? DEFAULT_MAX_SAMPLES);
    this.maxSamplesPerSource = Math.max(60, Number.isFinite(requestedMax) ? requestedMax : DEFAULT_MAX_SAMPLES);

    this.now = options.now ?? (() => Date.now());
    this.runner = options.runner ?? defaultRunner;
  }

  /** Starts background sampling (one immediate pass, then one per interval). No-op if already started. */
  start(): void {
    if (this.sampleTimer) {
      return;
    }
    void this.forceSample();
    this.sampleTimer = setInterval(() => {
      void this.forceSample();
    }, this.samplingIntervalMs);
    // Do not keep the process alive just for sampling.
    this.sampleTimer.unref?.();
  }

  /** Stops background sampling. Collected history is kept. No-op if not started. */
  stop(): void {
    if (!this.sampleTimer) {
      return;
    }
    clearInterval(this.sampleTimer);
    this.sampleTimer = null;
  }

  /** Returns all known sources sorted by name, sampling first if nothing has been collected yet. */
  async listSources(): Promise<ObservabilitySource[]> {
    await this.ensureSampled();
    return Array.from(this.sourceMeta.values())
      .sort((a, b) => a.name.localeCompare(b.name));
  }

  /**
   * Returns bucketed history for all (or the selected) sources.
   *
   * Samples older than the window are dropped; within a bucket the last stored sample wins.
   *
   * @param query Optional window, bucket width and source filter; invalid values are sanitized.
   */
  async getSeries(query?: ObservabilitySeriesQuery): Promise<ObservabilitySeriesSnapshot> {
    await this.ensureSampled();

    const now = this.now();
    const windowMinutes = sanitizeWindowMinutes(query?.windowMinutes);
    const bucketSeconds = sanitizeBucketSeconds(query?.bucketSeconds);
    const bucketMs = bucketSeconds * 1000;
    const lowerBound = now - (windowMinutes * 60_000);

    const sourceFilter = query?.sourceIds && query.sourceIds.length > 0
      ? new Set(query.sourceIds.map((id) => normalizeSourceId(id)).filter((id) => id.length > 0))
      : null;

    const series: ObservabilitySeriesEntry[] = [];
    for (const [sourceId, history] of this.sourceHistory.entries()) {
      if (sourceFilter && !sourceFilter.has(sourceId)) {
        continue;
      }

      const bucketMap = new Map<number, SampleRecord>();
      for (const sample of history) {
        if (sample.ts < lowerBound) {
          continue;
        }
        const bucketTs = Math.floor(sample.ts / bucketMs) * bucketMs;
        bucketMap.set(bucketTs, sample);
      }

      const points = Array.from(bucketMap.entries())
        .sort((a, b) => a[0] - b[0])
        .map(([bucketTs, sample]) => ({
          ts: bucketTs,
          stateCode: sample.stateCode,
          healthCode: sample.healthCode,
          errorCount: sample.errorCount,
          restartCount: sample.restartCount,
        }));

      series.push({ sourceId, points });
    }

    series.sort((a, b) => {
      const left = this.sourceMeta.get(a.sourceId)?.name ?? a.sourceId;
      const right = this.sourceMeta.get(b.sourceId)?.name ?? b.sourceId;
      return left.localeCompare(right);
    });

    return {
      generatedAt: now,
      windowMinutes,
      bucketSeconds,
      series,
    };
  }

  /**
   * Fetches recent, redacted log lines for one source.
   *
   * @param query Source id plus optional line count and look-back; invalid values are sanitized.
   * @throws Error when the id is blank, the source is unknown or not log-capable, or its
   *   runtime is unsupported; also propagates failures of the underlying log command.
   */
  async getServiceLogs(query: ServiceLogQuery): Promise<ServiceLogSnapshot> {
    await this.ensureSampled();

    const sourceId = normalizeSourceId(query.sourceId);
    if (!sourceId) {
      throw new Error('sourceId is required');
    }

    const source = this.sourceMeta.get(sourceId);
    if (!source || !source.logCapable) {
      throw new Error(`Log source not found or unavailable: ${sourceId}`);
    }

    const lines = sanitizeLogLines(query.lines);
    const sinceSeconds = sanitizeSinceSeconds(query.sinceSeconds);

    if (source.runtime === 'docker_compose') {
      return this.fetchDockerLogs(source, lines, sinceSeconds);
    }
    if (source.runtime === 'systemd_user' || source.runtime === 'systemd_system') {
      return this.fetchJournalLogs(source, lines, sinceSeconds);
    }

    throw new Error(`Unsupported log runtime for source: ${sourceId}`);
  }

  /**
   * Runs a sampling pass now, or joins the one already in progress.
   * Never rejects: failures of the pass are swallowed.
   */
  async forceSample(): Promise<void> {
    if (this.inFlightSample) {
      await this.inFlightSample;
      return;
    }

    this.inFlightSample = this.collectSample()
      .catch(() => {
        // Keep sampling resilient; errors are reflected as unavailable source snapshots.
      })
      .finally(() => {
        this.inFlightSample = null;
      });

    await this.inFlightSample;
  }

  /** Samples once if no source has been recorded yet. */
  private async ensureSampled(): Promise<void> {
    if (this.sourceMeta.size > 0) {
      return;
    }
    await this.forceSample();
  }

  /** Performs one sampling pass over all source families and records the results. */
  private async collectSample(): Promise<void> {
    const sampleTime = this.now();

    // allSettled: one failing family must not prevent the others from being recorded.
    const [flynnResult, localBackendsResult, dockerDependenciesResult] = await Promise.allSettled([
      fetchSystemdUnitStatus(this.runner, {
        unit: this.flynnSystemdUnit,
        name: 'Flynn daemon',
        user: false,
      }),
      listLocalBackendStatuses(this.config, async (args: string[]) => {
        return this.runner('systemctl', args, {
          timeoutMs: DEFAULT_TIMEOUT_MS,
          maxBufferBytes: 1024 * 1024,
        });
      }),
      listDockerDependencyStatuses(this.config, async (args: string[]) => {
        return this.runner('docker', ['compose', '-f', COMPOSE_FILE, ...args], {
          timeoutMs: DEFAULT_TIMEOUT_MS,
          maxBufferBytes: LARGE_BUFFER_BYTES,
        });
      }),
    ]);

    const flynnStatus = flynnResult.status === 'fulfilled'
      ? flynnResult.value
      : {
        unit: this.flynnSystemdUnit,
        name: 'Flynn daemon',
        loadState: 'unknown',
        activeState: 'unknown',
        subState: 'unknown',
        statusText: 'unavailable',
        pid: null,
        result: 'unknown',
        error: normalizeError(flynnResult.reason),
      };
    const localBackends = localBackendsResult.status === 'fulfilled'
      ? localBackendsResult.value
      : [];
    const dockerDependencies = dockerDependenciesResult.status === 'fulfilled'
      ? dockerDependenciesResult.value
      : [];

    const snapshots: SourceSnapshot[] = [];

    const flynnMapped = mapSystemdStatus(flynnStatus.activeState, flynnStatus.error);
    // When this process was not started by systemd and the unit is not running, report the
    // gateway process itself as the running daemon; journal logs are then not offered.
    const fallbackToProcessRuntime = !isGatewayRunningUnderSystemd() && flynnMapped.status !== 'running';
    const flynnStatusValue = fallbackToProcessRuntime ? 'running' : flynnMapped.status;
    const flynnStateCode = fallbackToProcessRuntime ? STATE_RUNNING : flynnMapped.stateCode;
    const flynnHealthCode = fallbackToProcessRuntime ? HEALTH_HEALTHY : flynnMapped.healthCode;
    snapshots.push({
      source: {
        id: 'systemd:flynn',
        name: 'Flynn daemon',
        kind: 'systemd_system',
        runtime: 'systemd_system',
        status: flynnStatusValue,
        graphCapable: true,
        logCapable: !fallbackToProcessRuntime,
        metadata: {
          unit: this.flynnSystemdUnit,
          state: fallbackToProcessRuntime ? 'running' : flynnStatus.activeState,
          statusText: fallbackToProcessRuntime ? 'running (gateway process)' : flynnStatus.statusText,
        },
      },
      stateCode: flynnStateCode,
      healthCode: flynnHealthCode,
      hasError: fallbackToProcessRuntime ? false : Boolean(flynnStatus.error),
      fingerprint: flynnStatus.pid ? `pid:${flynnStatus.pid}` : null,
    });

    for (const backend of localBackends) {
      const mapped = mapSystemdStatus(String(backend.activeState ?? ''), backend.error);
      snapshots.push({
        source: {
          id: `systemd-user:${backend.id}`,
          name: backend.name,
          kind: 'systemd_user',
          runtime: 'systemd_user',
          status: mapped.status,
          graphCapable: true,
          logCapable: backend.loadState !== 'not-found',
          metadata: {
            unit: backend.unit,
            state: backend.activeState,
            statusText: backend.statusText,
          },
        },
        stateCode: mapped.stateCode,
        healthCode: mapped.healthCode,
        hasError: Boolean(backend.error) || backend.activeState === 'failed',
        fingerprint: backend.pid ? `pid:${backend.pid}` : null,
      });
    }

    for (const dependency of dockerDependencies) {
      const mapped = mapDockerStatus(dependency.state, dependency.health, dependency.error);
      snapshots.push({
        source: {
          id: `docker:${dependency.id}`,
          name: dependency.name,
          kind: 'docker_dependency',
          runtime: 'docker_compose',
          status: mapped.status,
          graphCapable: true,
          logCapable: dependency.id !== 'compose',
          metadata: {
            service: dependency.service,
            state: dependency.state,
            health: dependency.health,
            statusText: dependency.statusText,
            containerName: dependency.containerName,
          },
        },
        stateCode: mapped.stateCode,
        healthCode: mapped.healthCode,
        hasError: Boolean(dependency.error) || mapped.status === 'degraded' || mapped.status === 'unavailable',
        fingerprint: dependency.containerName ? `container:${dependency.containerName}` : null,
      });
    }

    const seenSourceIds = new Set<string>();
    for (const snapshot of snapshots) {
      seenSourceIds.add(snapshot.source.id);
      this.recordSourceSample(snapshot, sampleTime);
    }

    // Keep stale entries visible as unavailable when they disappear.
    for (const [sourceId, source] of this.sourceMeta.entries()) {
      if (seenSourceIds.has(sourceId)) {
        continue;
      }
      this.recordSourceSample({
        source: {
          ...source,
          status: 'unavailable',
        },
        stateCode: STATE_UNAVAILABLE,
        healthCode: HEALTH_UNKNOWN,
        hasError: true,
        fingerprint: null,
      }, sampleTime);
    }
  }

  /**
   * Stores one observation: updates the source descriptor, the cumulative counters,
   * and appends to the bounded history.
   *
   * A restart is counted when the source enters the running state from any other
   * state, or when it stays running but its fingerprint changes. The very first
   * observation of a source never counts as a restart.
   */
  private recordSourceSample(snapshot: SourceSnapshot, timestamp: number): void {
    const sourceId = snapshot.source.id;
    this.sourceMeta.set(sourceId, snapshot.source);

    const counter: SourceCounter = this.sourceCounters.get(sourceId) ?? {
      errorCount: 0,
      restartCount: 0,
      lastStateCode: snapshot.stateCode,
      lastFingerprint: snapshot.fingerprint,
      hasPrevious: false,
    };

    if (snapshot.hasError) {
      counter.errorCount += 1;
    }

    if (counter.hasPrevious) {
      const enteredRunning = counter.lastStateCode !== STATE_RUNNING && snapshot.stateCode === STATE_RUNNING;
      const fingerprintChanged = (
        snapshot.stateCode === STATE_RUNNING
        && Boolean(snapshot.fingerprint)
        && Boolean(counter.lastFingerprint)
        && snapshot.fingerprint !== counter.lastFingerprint
      );
      if (enteredRunning || fingerprintChanged) {
        counter.restartCount += 1;
      }
    }

    counter.lastStateCode = snapshot.stateCode;
    counter.lastFingerprint = snapshot.fingerprint;
    counter.hasPrevious = true;
    this.sourceCounters.set(sourceId, counter);

    const records = this.sourceHistory.get(sourceId) ?? [];
    records.push({
      ts: timestamp,
      stateCode: snapshot.stateCode,
      healthCode: snapshot.healthCode,
      errorCount: counter.errorCount,
      restartCount: counter.restartCount,
    });

    // Drop the oldest samples once the per-source bound is exceeded.
    if (records.length > this.maxSamplesPerSource) {
      records.splice(0, records.length - this.maxSamplesPerSource);
    }

    this.sourceHistory.set(sourceId, records);
  }

  /**
   * Turns raw command output into a log snapshot: drops blank lines, parses and
   * redacts each remaining line, and flags the result as truncated when the line
   * count reached the requested limit.
   */
  private buildLogSnapshot(
    source: ObservabilitySource,
    stdout: string,
    requestedLines: number,
    parseLine: (line: string) => { ts?: number; level: 'info' | 'warn' | 'error'; text: string; redacted: boolean },
  ): ServiceLogSnapshot {
    let redacted = false;
    const entries: ServiceLogEntry[] = [];
    for (const rawLine of stdout.split('\n')) {
      const line = rawLine.trimEnd();
      if (line.trim().length === 0) {
        continue;
      }
      const item = parseLine(line);
      redacted = redacted || item.redacted;
      entries.push({ ts: item.ts, level: item.level, text: item.text });
    }

    return {
      sourceId: source.id,
      fetchedAt: this.now(),
      redacted,
      lines: entries,
      truncated: entries.length >= requestedLines,
    };
  }

  /**
   * Fetches logs of a docker compose service via `docker compose logs`.
   *
   * @throws Error when the source carries no compose service name; also propagates runner failures.
   */
  private async fetchDockerLogs(
    source: ObservabilitySource,
    lines: number,
    sinceSeconds: number,
  ): Promise<ServiceLogSnapshot> {
    const service = source.metadata?.service;
    if (!service) {
      throw new Error(`Missing docker compose service metadata for source: ${source.id}`);
    }

    // Pass every declared profile along with the logs command.
    const profileArgs = await discoverComposeProfileArgs(this.runner);
    const response = await this.runner(
      'docker',
      [
        'compose',
        '-f',
        COMPOSE_FILE,
        ...profileArgs,
        'logs',
        '--timestamps',
        '--tail',
        String(lines),
        '--since',
        `${sinceSeconds}s`,
        service,
      ],
      { timeoutMs: LARGE_TIMEOUT_MS, maxBufferBytes: LARGE_BUFFER_BYTES },
    );

    return this.buildLogSnapshot(source, response.stdout, lines, splitDockerLogLine);
  }

  /**
   * Fetches logs of a systemd unit via `journalctl` (with `--user` for user-scope units).
   *
   * @throws Error when the source carries no unit name; also propagates runner failures.
   */
  private async fetchJournalLogs(
    source: ObservabilitySource,
    lines: number,
    sinceSeconds: number,
  ): Promise<ServiceLogSnapshot> {
    const unit = source.metadata?.unit;
    if (!unit) {
      throw new Error(`Missing systemd unit metadata for source: ${source.id}`);
    }

    const user = source.runtime === 'systemd_user';
    const response = await this.runner(
      'journalctl',
      [
        ...(user ? ['--user'] : []),
        '-u',
        unit,
        '--since',
        `${sinceSeconds} seconds ago`,
        '--no-pager',
        '--output',
        'short-iso-precise',
        '-n',
        String(lines),
      ],
      {
        timeoutMs: LARGE_TIMEOUT_MS,
        maxBufferBytes: LARGE_BUFFER_BYTES,
      },
    );

    return this.buildLogSnapshot(source, response.stdout, lines, splitJournalLine);
  }
}
|
|
|
|
/**
 * Query defaults and limits exposed to other modules.
 * `ALLOWED_BUCKET_SECONDS` is exported as a copy of the module-level tuple.
 */
export const observabilityDefaults = {
  DEFAULT_WINDOW_MINUTES,
  MAX_WINDOW_MINUTES,
  DEFAULT_BUCKET_SECONDS,
  ALLOWED_BUCKET_SECONDS: [...ALLOWED_BUCKET_SECONDS],
  DEFAULT_LOG_LINES,
  MAX_LOG_LINES,
  DEFAULT_LOG_SINCE_SECONDS,
  MAX_LOG_SINCE_SECONDS,
} as const;
|