feat(gateway): add observability sources, series, and service log RPCs
This commit is contained in:
@@ -0,0 +1,932 @@
|
||||
import { execFile as execFileCb } from 'node:child_process';
|
||||
import { promisify } from 'node:util';
|
||||
import { redactForAudit } from '../../audit/redact.js';
|
||||
import type { Config } from '../../config/index.js';
|
||||
import { listDockerDependencyStatuses } from './dockerDependencies.js';
|
||||
import { listLocalBackendStatuses } from './localBackends.js';
|
||||
|
||||
/** Promise-returning wrapper around `child_process.execFile`. */
const execFile = promisify(execFileCb);

/** Compose file passed to every `docker compose -f …` invocation in this module. */
const COMPOSE_FILE = 'docker-compose.yml';

/** systemd unit inspected for the Flynn daemon unless the collector options override it. */
const DEFAULT_FLYNN_UNIT = 'flynn.service';

// Series query window (minutes): default and upper bound.
const DEFAULT_WINDOW_MINUTES = 60;
const MAX_WINDOW_MINUTES = 4 * 60;

// Series bucket width (seconds): default and the only accepted values.
const DEFAULT_BUCKET_SECONDS = 30;
const ALLOWED_BUCKET_SECONDS = [15, 30, 60] as const;

// Log queries: number of lines (default / cap) and look-back in seconds (default / cap).
const DEFAULT_LOG_LINES = 200;
const MAX_LOG_LINES = 1000;
const DEFAULT_LOG_SINCE_SECONDS = 15 * 60;
const MAX_LOG_SINCE_SECONDS = 24 * 60 * 60;

// Background sampling: interval between samples and retained samples per source.
const DEFAULT_SAMPLE_INTERVAL_MS = 30 * 1000;
const DEFAULT_MAX_SAMPLES = 720;

// Child-process limits: timeouts in milliseconds and the stdout/stderr buffer cap in bytes.
const DEFAULT_TIMEOUT_MS = 10 * 1000;
const LARGE_TIMEOUT_MS = 15 * 1000;
const LARGE_BUFFER_BYTES = 4 * 1024 * 1024;
|
||||
|
||||
// Numeric lifecycle codes recorded as `stateCode` on every sample.
const STATE_UNAVAILABLE = 0;
const STATE_STOPPED = 1;
const STATE_DEGRADED = 2;
const STATE_RUNNING = 3;

// Numeric health codes recorded as `healthCode` on every sample.
const HEALTH_UNKNOWN = 0;
const HEALTH_DEGRADED = 1;
const HEALTH_HEALTHY = 2;
|
||||
|
||||
/** What kind of thing a source is: a docker compose dependency or a systemd unit (user or system scope). */
export type ObservabilitySourceKind =
  | 'docker_dependency'
  | 'systemd_user'
  | 'systemd_system';

/** Which runtime manages a source; the collector uses it to pick the log backend. */
export type ObservabilityRuntime =
  | 'docker_compose'
  | 'systemd_user'
  | 'systemd_system';

/** Coarse, display-oriented status of a source. */
export type ObservabilitySourceStatus =
  | 'running'
  | 'degraded'
  | 'stopped'
  | 'unavailable'
  | 'unknown';
|
||||
|
||||
/** Description of one observable service as reported to the dashboard. */
export interface ObservabilitySource {
  /** Stable identifier; the collector builds ids such as `systemd:flynn`, `systemd-user:<id>` and `docker:<id>`. */
  id: string;
  /** Human-readable name; also used to sort sources and series. */
  name: string;
  kind: ObservabilitySourceKind;
  runtime: ObservabilityRuntime;
  status: ObservabilitySourceStatus;
  /** Presumably whether the dashboard may chart this source; the collector currently always sets it to true. */
  graphCapable: boolean;
  /** Log requests for a source are rejected when this is false. */
  logCapable: boolean;
  /** Runtime-specific details; `unit` is needed for journal logs, `service` for docker compose logs. */
  metadata?: {
    unit?: string;
    service?: string;
    state?: string;
    health?: string;
    statusText?: string;
    containerName?: string | null;
  };
}
|
||||
|
||||
/** One bucketed data point of a source's status series. */
export interface ObservabilitySeriesPoint {
  /** Start of the bucket, in epoch milliseconds. */
  ts: number;
  /** One of the `STATE_*` codes. */
  stateCode: number;
  /** One of the `HEALTH_*` codes. */
  healthCode: number;
  /** Running total of samples flagged as erroneous since the collector first saw the source. */
  errorCount: number;
  /** Running total of detected restarts since the collector first saw the source. */
  restartCount: number;
}
|
||||
|
||||
/** The bucketed series of a single source. */
export interface ObservabilitySeriesEntry {
  /** Id of the source the points belong to. */
  sourceId: string;
  /** Points in ascending bucket-timestamp order. */
  points: ObservabilitySeriesPoint[];
}
|
||||
|
||||
/** Result of a series query: the effective (sanitized) parameters plus one entry per source. */
export interface ObservabilitySeriesSnapshot {
  /** Collector clock reading (epoch milliseconds) when the snapshot was built. */
  generatedAt: number;
  /** Window actually applied, after defaulting and clamping. */
  windowMinutes: number;
  /** Bucket width actually applied, after validation. */
  bucketSeconds: number;
  series: ObservabilitySeriesEntry[];
}
|
||||
|
||||
/** A single parsed log line. */
export interface ServiceLogEntry {
  /** Epoch milliseconds parsed from the line's timestamp prefix; absent when none could be parsed. */
  ts?: number;
  /** Severity guessed from keywords in the line text. */
  level?: 'info' | 'warn' | 'error';
  /** Line text after redaction. */
  text: string;
}
|
||||
|
||||
/** Result of a service log query. */
export interface ServiceLogSnapshot {
  sourceId: string;
  /** Collector clock reading (epoch milliseconds) when the logs were fetched. */
  fetchedAt: number;
  /** True when at least one returned line had content redacted. */
  redacted: boolean;
  lines: ServiceLogEntry[];
  /** True when the number of returned lines reached the requested limit, so older lines may be missing. */
  truncated: boolean;
}
|
||||
|
||||
/** Parameters of a service log query. */
export interface ServiceLogQuery {
  /** Id of the source whose logs are requested. */
  sourceId: string;
  /** Maximum number of lines; defaults to 200 and is capped at 1000. */
  lines?: number;
  /** Look-back period in seconds; defaults to 900 and is capped at 86 400. */
  sinceSeconds?: number;
}
|
||||
|
||||
/** Parameters of a series query; every field is optional. */
export interface ObservabilitySeriesQuery {
  /** Window length in minutes; defaults to 60 and is capped at 240. */
  windowMinutes?: number;
  /** Bucket width in seconds; only 15, 30 and 60 are accepted, anything else falls back to 30. */
  bucketSeconds?: number;
  /** When non-empty, restricts the result to these source ids. */
  sourceIds?: string[];
}
|
||||
|
||||
/** Captured output of a finished child process. */
type ExecResult = {
  stdout: string;
  stderr: string;
};

/**
 * Runs an external command and resolves with its captured output.
 * Injectable so the collector can be driven without spawning real processes.
 */
type CommandRunner = (
  command: string,
  args: string[],
  opts?: {
    timeoutMs?: number;
    maxBufferBytes?: number;
  },
) => Promise<ExecResult>;
|
||||
|
||||
/** What one sampling pass observed for one source. */
interface SourceSnapshot {
  source: ObservabilitySource;
  /** One of the `STATE_*` codes. */
  stateCode: number;
  /** One of the `HEALTH_*` codes. */
  healthCode: number;
  /** Whether this observation increments the source's error counter. */
  hasError: boolean;
  /** Process/container identity (e.g. `pid:…`, `container:…`) used to detect restarts; null when unknown. */
  fingerprint: string | null;
}
|
||||
|
||||
/** Running per-source counters plus the state needed to compare consecutive samples. */
interface SourceCounter {
  errorCount: number;
  restartCount: number;
  /** State code of the most recently recorded sample. */
  lastStateCode: number;
  /** Fingerprint of the most recently recorded sample. */
  lastFingerprint: string | null;
  /** False until a first sample has been recorded; restart detection is skipped for that first sample. */
  hasPrevious: boolean;
}
|
||||
|
||||
/** One stored history sample of a source. */
interface SampleRecord {
  /** Collector clock reading (epoch milliseconds) of the sampling pass. */
  ts: number;
  stateCode: number;
  healthCode: number;
  /** Value of the running error counter at the time of the sample. */
  errorCount: number;
  /** Value of the running restart counter at the time of the sample. */
  restartCount: number;
}
|
||||
|
||||
/**
 * Default command runner: executes the command through the promisified `execFile`.
 *
 * @param command - Executable to run.
 * @param args - Argument vector passed to the executable.
 * @param opts - Optional timeout (ms) and output buffer cap (bytes); missing values fall back to
 *   `DEFAULT_TIMEOUT_MS` and `LARGE_BUFFER_BYTES`.
 * @returns The captured stdout/stderr.
 */
function defaultRunner(
  command: string,
  args: string[],
  opts?: { timeoutMs?: number; maxBufferBytes?: number },
): Promise<ExecResult> {
  const timeout = opts?.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const maxBuffer = opts?.maxBufferBytes ?? LARGE_BUFFER_BYTES;
  const pending = execFile(command, args, { timeout, maxBuffer });
  return pending as Promise<ExecResult>;
}
|
||||
|
||||
/**
 * Turns an arbitrary thrown value into a short, human-readable message.
 *
 * For object-like errors the first non-blank string among `stderr`, `stdout` and `message`
 * (in that order) is returned, trimmed. Anything else falls back to `String(error)`.
 *
 * Fields that are not strings are skipped rather than trimmed, so a non-string `stderr`
 * or `stdout` can no longer make this helper itself throw while an error is being handled.
 *
 * @param error - The caught value.
 * @returns A trimmed message, or the string conversion of `error` when no message is available.
 */
function normalizeError(error: unknown): string {
  if (error !== null && typeof error === 'object') {
    const fields = error as { stderr?: unknown; stdout?: unknown; message?: unknown };
    for (const candidate of [fields.stderr, fields.stdout, fields.message]) {
      if (typeof candidate !== 'string') {
        continue;
      }
      const text = candidate.trim();
      if (text.length > 0) {
        return text;
      }
    }
  }
  // `Error` instances are objects, so their message was already considered above.
  return String(error);
}
|
||||
|
||||
/**
 * Parses newline-separated `KEY=VALUE` text into a plain object.
 *
 * Blank lines and lines without a `=` after the first character are ignored. Only the first
 * `=` splits a line, keys and values are trimmed, and a repeated key keeps its last value.
 *
 * @param output - Raw text to parse.
 * @returns Map of key to value.
 */
function parseKeyValueOutput(output: string): Record<string, string> {
  const pairs: Record<string, string> = {};
  output.split('\n').forEach((rawLine) => {
    const separator = rawLine.indexOf('=');
    const isBlank = rawLine.trim().length === 0;
    if (isBlank || separator <= 0) {
      return;
    }
    const name = rawLine.slice(0, separator).trim();
    pairs[name] = rawLine.slice(separator + 1).trim();
  });
  return pairs;
}
|
||||
|
||||
/**
 * Parses a strictly positive number and rounds it down to an integer.
 *
 * @param input - Text to parse; may be missing.
 * @returns The floored value, or null when the input is missing, empty, not a finite number,
 *   or not greater than zero.
 */
function parseInteger(input: string | undefined): number | null {
  if (!input) {
    return null;
  }
  const numeric = Number(input);
  const isPositive = Number.isFinite(numeric) && numeric > 0;
  return isPositive ? Math.floor(numeric) : null;
}
|
||||
|
||||
/**
 * Maps a systemd `ActiveState` (compared trimmed and case-insensitively) to the dashboard
 * status and the numeric state/health codes.
 *
 * @param activeState - Active state reported for the unit.
 * @param error - When non-empty, the unit is reported as unavailable regardless of `activeState`.
 * @returns Status plus state/health codes. Unrecognized states yield status `unknown` with the
 *   degraded state code.
 */
function mapSystemdStatus(activeState: string, error?: string): {
  status: ObservabilitySourceStatus;
  stateCode: number;
  healthCode: number;
} {
  if (error) {
    return { status: 'unavailable', stateCode: STATE_UNAVAILABLE, healthCode: HEALTH_UNKNOWN };
  }

  switch (activeState.trim().toLowerCase()) {
    case 'active':
      return { status: 'running', stateCode: STATE_RUNNING, healthCode: HEALTH_HEALTHY };
    case 'inactive':
    case 'dead':
      return { status: 'stopped', stateCode: STATE_STOPPED, healthCode: HEALTH_UNKNOWN };
    // A failed unit and a unit in transition are both surfaced as degraded.
    case 'failed':
    case 'activating':
    case 'deactivating':
    case 'reloading':
      return { status: 'degraded', stateCode: STATE_DEGRADED, healthCode: HEALTH_DEGRADED };
    case 'not-found':
      return { status: 'unavailable', stateCode: STATE_UNAVAILABLE, healthCode: HEALTH_UNKNOWN };
    default:
      return { status: 'unknown', stateCode: STATE_DEGRADED, healthCode: HEALTH_UNKNOWN };
  }
}
|
||||
|
||||
/**
 * Maps a docker dependency's state and health (both compared trimmed and case-insensitively)
 * to the dashboard status and the numeric state/health codes.
 *
 * @param stateRaw - Container state as reported by the dependency status helper.
 * @param healthRaw - Container health as reported by the dependency status helper.
 * @param error - When non-empty, the dependency is reported as unavailable.
 * @returns Status plus state/health codes. Unrecognized states yield status `unknown` with the
 *   degraded state code.
 */
function mapDockerStatus(stateRaw: string, healthRaw: string, error?: string): {
  status: ObservabilitySourceStatus;
  stateCode: number;
  healthCode: number;
} {
  if (error) {
    return { status: 'unavailable', stateCode: STATE_UNAVAILABLE, healthCode: HEALTH_UNKNOWN };
  }

  const health = healthRaw.trim().toLowerCase();

  switch (stateRaw.trim().toLowerCase()) {
    case 'running': {
      if (health === 'healthy') {
        return { status: 'running', stateCode: STATE_RUNNING, healthCode: HEALTH_HEALTHY };
      }
      // No health information: still running, but health stays unknown.
      if (health === 'none' || health === 'unknown') {
        return { status: 'running', stateCode: STATE_RUNNING, healthCode: HEALTH_UNKNOWN };
      }
      // Any other health value on a running container counts as degraded.
      return { status: 'degraded', stateCode: STATE_DEGRADED, healthCode: HEALTH_DEGRADED };
    }
    case 'stopped':
    case 'not-created':
    case 'created':
      return { status: 'stopped', stateCode: STATE_STOPPED, healthCode: HEALTH_UNKNOWN };
    case 'unavailable':
    case 'not-found':
      return { status: 'unavailable', stateCode: STATE_UNAVAILABLE, healthCode: HEALTH_UNKNOWN };
    case 'restarting':
    case 'paused':
      return { status: 'degraded', stateCode: STATE_DEGRADED, healthCode: HEALTH_DEGRADED };
    default:
      return { status: 'unknown', stateCode: STATE_DEGRADED, healthCode: HEALTH_UNKNOWN };
  }
}
|
||||
|
||||
/**
 * Guesses a log level from keywords anywhere in the line (case-insensitive substring match).
 * Error keywords win over warning keywords; lines matching neither are `info`.
 *
 * @param text - Log line text.
 * @returns The guessed level.
 */
function classifyLogLevel(text: string): 'info' | 'warn' | 'error' {
  const haystack = text.toLowerCase();
  const mentionsAny = (needles: readonly string[]): boolean =>
    needles.some((needle) => haystack.includes(needle));

  if (mentionsAny(['error', 'failed', 'fatal', 'panic'])) {
    return 'error';
  }
  if (mentionsAny(['warn', 'warning', 'degraded'])) {
    return 'warn';
  }
  return 'info';
}
|
||||
|
||||
/**
 * Inserts the colon into a trailing numeric UTC offset written without one
 * (`+0100` becomes `+01:00`). Strings without such a suffix are returned unchanged.
 *
 * @param ts - Timestamp text.
 * @returns The timestamp with a colon-separated offset.
 */
function normalizeOffset(ts: string): string {
  const compactOffset = /([+-]\d{2})(\d{2})$/;
  return ts.replace(compactOffset, '$1:$2');
}
|
||||
|
||||
/**
 * Parses a log timestamp into epoch milliseconds.
 *
 * Tries, in order: the raw text, the raw text with a colon-normalized offset, the text with its
 * first space replaced by `T`, and that variant with a normalized offset. The first candidate
 * that `Date.parse` accepts wins.
 *
 * @param raw - Timestamp text; may be missing.
 * @returns Epoch milliseconds, or undefined when the input is empty or no candidate parses.
 */
function parseTimestamp(raw: string | undefined): number | undefined {
  if (!raw) {
    return undefined;
  }

  const isoLike = raw.includes(' ') ? raw.replace(' ', 'T') : raw;
  const attempts = [raw, normalizeOffset(raw), isoLike, normalizeOffset(isoLike)];

  for (const attempt of attempts) {
    const epochMs = Date.parse(attempt);
    if (!Number.isNaN(epochMs)) {
      return epochMs;
    }
  }
  return undefined;
}
|
||||
|
||||
/**
 * Normalizes a requested series window.
 *
 * @param value - Requested window in minutes; may be missing.
 * @returns The floored value capped at `MAX_WINDOW_MINUTES`, or `DEFAULT_WINDOW_MINUTES` when the
 *   request is missing, not finite, or not positive.
 */
function sanitizeWindowMinutes(value: number | undefined): number {
  const requested = Math.floor(Number(value ?? DEFAULT_WINDOW_MINUTES));
  const usable = Number.isFinite(requested) && requested > 0;
  return usable ? Math.min(MAX_WINDOW_MINUTES, requested) : DEFAULT_WINDOW_MINUTES;
}
|
||||
|
||||
/**
 * Validates a requested bucket width.
 *
 * @param value - Requested width in seconds; may be missing.
 * @returns The floored value when it is one of `ALLOWED_BUCKET_SECONDS`, otherwise
 *   `DEFAULT_BUCKET_SECONDS`.
 */
function sanitizeBucketSeconds(value: number | undefined): number {
  const requested = Math.floor(Number(value ?? DEFAULT_BUCKET_SECONDS));
  const allowed: readonly number[] = ALLOWED_BUCKET_SECONDS;
  return allowed.includes(requested) ? requested : DEFAULT_BUCKET_SECONDS;
}
|
||||
|
||||
/**
 * Normalizes a requested log line count.
 *
 * @param value - Requested number of lines; may be missing.
 * @returns The floored value capped at `MAX_LOG_LINES`, or `DEFAULT_LOG_LINES` when the request is
 *   missing, not finite, or not positive.
 */
function sanitizeLogLines(value: number | undefined): number {
  const requested = Math.floor(Number(value ?? DEFAULT_LOG_LINES));
  const usable = Number.isFinite(requested) && requested > 0;
  return usable ? Math.min(MAX_LOG_LINES, requested) : DEFAULT_LOG_LINES;
}
|
||||
|
||||
/**
 * Normalizes a requested log look-back period.
 *
 * @param value - Requested look-back in seconds; may be missing.
 * @returns The floored value capped at `MAX_LOG_SINCE_SECONDS`, or `DEFAULT_LOG_SINCE_SECONDS` when
 *   the request is missing, not finite, or not positive.
 */
function sanitizeSinceSeconds(value: number | undefined): number {
  const requested = Math.floor(Number(value ?? DEFAULT_LOG_SINCE_SECONDS));
  const usable = Number.isFinite(requested) && requested > 0;
  return usable ? Math.min(MAX_LOG_SINCE_SECONDS, requested) : DEFAULT_LOG_SINCE_SECONDS;
}
|
||||
|
||||
/**
 * Canonicalizes a source id received from a caller by trimming surrounding whitespace.
 *
 * Ids arrive from query objects, so the runtime value is not guaranteed to match the declared
 * type. A missing or non-string id is mapped to the empty string instead of throwing, which lets
 * callers treat it like any other blank id (reject it or filter it out).
 *
 * @param value - Source id as supplied by the caller.
 * @returns The trimmed id, or `''` when the value is not a string.
 */
function normalizeSourceId(value: string): string {
  return typeof value === 'string' ? value.trim() : '';
}
|
||||
|
||||
/**
 * Passes a log line through the audit redactor.
 *
 * @param text - Raw log line text.
 * @returns The redactor's output as a string, and whether it reported at least one redaction.
 */
function redactLogText(text: string): { text: string; redacted: boolean } {
  const { value, redactions } = redactForAudit(text);
  // The redactor's value is not necessarily a string; coerce so log entries stay textual.
  const sanitized = typeof value === 'string' ? value : String(value);
  return { text: sanitized, redacted: redactions > 0 };
}
|
||||
|
||||
/**
 * Parses one line of `docker compose logs --timestamps` output.
 *
 * Strips a leading `<name> | ` prefix when present, then a leading ISO-8601 timestamp when
 * present, redacts what remains and classifies its level.
 *
 * @param line - Raw output line.
 * @returns Parsed timestamp (if any), guessed level, redacted text and whether redaction occurred.
 */
function splitDockerLogLine(line: string): { ts?: number; level: 'info' | 'warn' | 'error'; text: string; redacted: boolean } {
  let body = line.trim();

  const prefixMatch = body.match(/^([A-Za-z0-9_.-]+)\s+\|\s+(.*)$/);
  if (prefixMatch) {
    body = prefixMatch[2]?.trim() ?? body;
  }

  let ts: number | undefined;
  const stampMatch = body.match(/^(\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d(?:\.\d+)?(?:Z|[+-]\d\d:?\d\d))\s+(.*)$/);
  if (stampMatch) {
    // The timestamp is stripped from the text even if it later fails to parse.
    ts = parseTimestamp(stampMatch[1]);
    body = stampMatch[2] ?? body;
  }

  const { text, redacted } = redactLogText(body);
  return { ts, level: classifyLogLevel(text), text, redacted };
}
|
||||
|
||||
/**
 * Parses one line of `journalctl --output short-iso-precise` output.
 *
 * A leading run of at least 19 timestamp-like characters is treated as the timestamp, but it is
 * only stripped from the text when it actually parses. The remainder is redacted and classified.
 *
 * @param line - Raw output line.
 * @returns Parsed timestamp (if any), guessed level, redacted text and whether redaction occurred.
 */
function splitJournalLine(line: string): { ts?: number; level: 'info' | 'warn' | 'error'; text: string; redacted: boolean } {
  let body = line.trim();
  let ts: number | undefined;

  const stampMatch = body.match(/^([0-9T:\-+. ]{19,})(\s+.*)$/);
  const candidateTs = stampMatch ? parseTimestamp(stampMatch[1]?.trim()) : undefined;
  if (stampMatch && candidateTs !== undefined) {
    ts = candidateTs;
    body = stampMatch[2]?.trim() ?? body;
  }

  const { text, redacted } = redactLogText(body);
  return { ts, level: classifyLogLevel(text), text, redacted };
}
|
||||
|
||||
/** Status of one systemd unit as read via `systemctl show`. */
interface SystemdStatus {
  /** Unit name that was queried. */
  unit: string;
  /** Unit description when reported, otherwise the caller-supplied display name. */
  name: string;
  loadState: string;
  activeState: string;
  subState: string;
  /** `activeState` alone when it equals `subState`, otherwise `activeState/subState`; `unavailable` on failure. */
  statusText: string;
  /** Main process id, or null when not reported or not positive. */
  pid: number | null;
  result: string;
  /** Set only when the status could not be read. */
  error?: string;
}
|
||||
|
||||
/**
 * Reads a unit's status with `systemctl show`.
 *
 * Never rejects: any failure is converted into a status whose `error` field carries the
 * normalized message.
 *
 * @param runner - Command runner used to invoke `systemctl`.
 * @param opts - Unit to query, fallback display name, and whether to query the user manager
 *   (`--user`).
 * @returns The parsed status, or an `unavailable` placeholder on failure.
 */
async function fetchSystemdUnitStatus(
  runner: CommandRunner,
  opts: { unit: string; name: string; user: boolean },
): Promise<SystemdStatus> {
  const { unit, name, user } = opts;

  const args: string[] = [];
  if (user) {
    args.push('--user');
  }
  args.push(
    'show',
    unit,
    '--property=LoadState,ActiveState,SubState,Description,ExecMainPID,Result',
    '--no-pager',
  );

  try {
    const { stdout } = await runner('systemctl', args, {
      timeoutMs: DEFAULT_TIMEOUT_MS,
      maxBufferBytes: 1024 * 1024,
    });
    const props = parseKeyValueOutput(stdout);

    // `||` rather than `??`: an empty property value should fall back as well.
    const activeState = props.ActiveState || 'unknown';
    const subState = props.SubState || 'unknown';

    return {
      unit,
      name: props.Description || name,
      loadState: props.LoadState || 'unknown',
      activeState,
      subState,
      statusText: activeState === subState ? activeState : `${activeState}/${subState}`,
      pid: parseInteger(props.ExecMainPID),
      result: props.Result || 'unknown',
    };
  } catch (error) {
    const detail = normalizeError(error);
    const unitMissing = detail.toLowerCase().includes('not found');
    return {
      unit,
      name,
      loadState: 'unknown',
      activeState: unitMissing ? 'not-found' : 'unknown',
      subState: 'unknown',
      statusText: 'unavailable',
      pid: null,
      result: 'unknown',
      error: detail,
    };
  }
}
|
||||
|
||||
/**
 * Builds `--profile <name>` arguments for every profile listed by
 * `docker compose config --profiles`.
 *
 * @param runner - Command runner used to invoke `docker`.
 * @returns A flat argument list with one `--profile` pair per distinct profile, in the order
 *   first reported; an empty list when the command fails.
 */
async function discoverComposeProfileArgs(runner: CommandRunner): Promise<string[]> {
  try {
    const { stdout } = await runner('docker', ['compose', '-f', COMPOSE_FILE, 'config', '--profiles'], {
      timeoutMs: DEFAULT_TIMEOUT_MS,
      maxBufferBytes: LARGE_BUFFER_BYTES,
    });

    const distinctProfiles = new Set(
      stdout
        .split('\n')
        .map((entry) => entry.trim())
        .filter((entry) => entry.length > 0),
    );
    return Array.from(distinctProfiles).flatMap((profile) => ['--profile', profile]);
  } catch {
    // Best effort: without profile information the caller runs compose with no profile flags.
    return [];
  }
}
|
||||
|
||||
/** Construction options for `ObservabilityCollector`. */
export interface ObservabilityCollectorOptions {
  /** Application configuration, forwarded to the local-backend and docker-dependency status helpers. */
  config: Config;
  /** systemd unit of the Flynn daemon; defaults to `flynn.service`. */
  flynnSystemdUnit?: string;
  /** Interval between background samples in milliseconds; defaults to 30 000, never below 5 000. */
  samplingIntervalMs?: number;
  /** History length kept per source; defaults to 720, never below 60. */
  maxSamplesPerSource?: number;
  /** Clock returning epoch milliseconds; defaults to `Date.now`. */
  now?: () => number;
  /** Command runner; defaults to an `execFile`-based runner. */
  runner?: CommandRunner;
}
|
||||
|
||||
/**
 * Collects bounded service status samples and log snapshots for the web dashboard.
 *
 * Three groups of sources are sampled: the Flynn daemon's systemd unit, the local backends
 * (systemd user units) and the docker compose dependencies. Each sampling pass appends one
 * record per source to an in-memory history capped at `maxSamplesPerSource`, and maintains
 * running error and restart counters per source.
 */
export class ObservabilityCollector {
  private readonly config: Config;
  private readonly flynnSystemdUnit: string;
  private readonly samplingIntervalMs: number;
  private readonly maxSamplesPerSource: number;
  private readonly now: () => number;
  private readonly runner: CommandRunner;

  private sampleTimer: NodeJS.Timeout | null = null;
  /** The sampling pass currently running, shared by concurrent `forceSample` callers. */
  private inFlightSample: Promise<void> | null = null;
  private readonly sourceHistory = new Map<string, SampleRecord[]>();
  private readonly sourceMeta = new Map<string, ObservabilitySource>();
  private readonly sourceCounters = new Map<string, SourceCounter>();

  /**
   * @param options - Configuration plus optional overrides. Non-finite numeric overrides fall
   *   back to their defaults; the sampling interval is at least 5 000 ms and the history length
   *   at least 60 samples.
   */
  constructor(options: ObservabilityCollectorOptions) {
    this.config = options.config;
    this.flynnSystemdUnit = options.flynnSystemdUnit ?? DEFAULT_FLYNN_UNIT;

    // NaN/Infinity would pass straight through Math.max and end up as a timer delay or as an
    // ineffective history cap, so such values are replaced by the defaults first.
    const requestedInterval = Math.floor(options.samplingIntervalMs ?? DEFAULT_SAMPLE_INTERVAL_MS);
    this.samplingIntervalMs = Math.max(
      5_000,
      Number.isFinite(requestedInterval) ? requestedInterval : DEFAULT_SAMPLE_INTERVAL_MS,
    );
    const requestedMaxSamples = Math.floor(options.maxSamplesPerSource ?? DEFAULT_MAX_SAMPLES);
    this.maxSamplesPerSource = Math.max(
      60,
      Number.isFinite(requestedMaxSamples) ? requestedMaxSamples : DEFAULT_MAX_SAMPLES,
    );

    this.now = options.now ?? (() => Date.now());
    this.runner = options.runner ?? defaultRunner;
  }

  /** Takes a sample immediately and then periodically. Does nothing when already started. */
  start(): void {
    if (this.sampleTimer) {
      return;
    }
    void this.forceSample();
    this.sampleTimer = setInterval(() => {
      void this.forceSample();
    }, this.samplingIntervalMs);
    // Do not let the sampling timer keep the process alive.
    this.sampleTimer.unref?.();
  }

  /** Stops periodic sampling. A sample that is already running is not cancelled. */
  stop(): void {
    if (!this.sampleTimer) {
      return;
    }
    clearInterval(this.sampleTimer);
    this.sampleTimer = null;
  }

  /**
   * Lists all known sources sorted by name, sampling first if nothing has been sampled yet.
   *
   * @returns The latest recorded description of every source.
   */
  async listSources(): Promise<ObservabilitySource[]> {
    await this.ensureSampled();
    return Array.from(this.sourceMeta.values())
      .sort((a, b) => a.name.localeCompare(b.name));
  }

  /**
   * Returns the bucketed status history of the sources.
   *
   * Samples older than the window are dropped. Within each bucket the most recent sample wins,
   * and the point's timestamp is the bucket start.
   *
   * @param query - Optional window, bucket width and source filter; invalid values are sanitized.
   * @returns The effective parameters plus one series per matching source, sorted by source name.
   */
  async getSeries(query?: ObservabilitySeriesQuery): Promise<ObservabilitySeriesSnapshot> {
    await this.ensureSampled();

    const now = this.now();
    const windowMinutes = sanitizeWindowMinutes(query?.windowMinutes);
    const bucketSeconds = sanitizeBucketSeconds(query?.bucketSeconds);
    const bucketMs = bucketSeconds * 1000;
    const lowerBound = now - (windowMinutes * 60_000);

    const sourceFilter = query?.sourceIds && query.sourceIds.length > 0
      ? new Set(query.sourceIds.map((id) => normalizeSourceId(id)).filter((id) => id.length > 0))
      : null;

    const series: ObservabilitySeriesEntry[] = [];
    for (const [sourceId, history] of this.sourceHistory.entries()) {
      if (sourceFilter && !sourceFilter.has(sourceId)) {
        continue;
      }

      // History is appended chronologically, so a later `set` keeps the newest sample per bucket.
      const bucketMap = new Map<number, SampleRecord>();
      for (const sample of history) {
        if (sample.ts < lowerBound) {
          continue;
        }
        const bucketTs = Math.floor(sample.ts / bucketMs) * bucketMs;
        bucketMap.set(bucketTs, sample);
      }

      const points = Array.from(bucketMap.entries())
        .sort((a, b) => a[0] - b[0])
        .map(([bucketTs, sample]) => ({
          ts: bucketTs,
          stateCode: sample.stateCode,
          healthCode: sample.healthCode,
          errorCount: sample.errorCount,
          restartCount: sample.restartCount,
        }));

      series.push({ sourceId, points });
    }

    series.sort((a, b) => {
      const left = this.sourceMeta.get(a.sourceId)?.name ?? a.sourceId;
      const right = this.sourceMeta.get(b.sourceId)?.name ?? b.sourceId;
      return left.localeCompare(right);
    });

    return {
      generatedAt: now,
      windowMinutes,
      bucketSeconds,
      series,
    };
  }

  /**
   * Fetches recent log lines of one source.
   *
   * @param query - Source id plus optional line count and look-back period (both sanitized).
   * @returns The parsed, redacted log lines.
   * @throws Error when `sourceId` is missing or blank, when the source is unknown or not
   *   log-capable, when required metadata is missing, or when the underlying command fails.
   */
  async getServiceLogs(query: ServiceLogQuery): Promise<ServiceLogSnapshot> {
    await this.ensureSampled();

    // Queries come from callers outside this module; a missing id must produce the
    // "sourceId is required" error below rather than a TypeError.
    const sourceId = typeof query?.sourceId === 'string' ? normalizeSourceId(query.sourceId) : '';
    if (!sourceId) {
      throw new Error('sourceId is required');
    }

    const source = this.sourceMeta.get(sourceId);
    if (!source || !source.logCapable) {
      throw new Error(`Log source not found or unavailable: ${sourceId}`);
    }

    const lines = sanitizeLogLines(query.lines);
    const sinceSeconds = sanitizeSinceSeconds(query.sinceSeconds);

    if (source.runtime === 'docker_compose') {
      return this.fetchDockerLogs(source, lines, sinceSeconds);
    }
    if (source.runtime === 'systemd_user' || source.runtime === 'systemd_system') {
      return this.fetchJournalLogs(source, lines, sinceSeconds);
    }

    throw new Error(`Unsupported log runtime for source: ${sourceId}`);
  }

  /**
   * Runs a sampling pass now. If one is already running, waits for that one instead of starting
   * another. Never rejects.
   */
  async forceSample(): Promise<void> {
    if (this.inFlightSample) {
      await this.inFlightSample;
      return;
    }

    this.inFlightSample = this.collectSample()
      .catch(() => {
        // Keep sampling resilient; errors are reflected as unavailable source snapshots.
      })
      .finally(() => {
        this.inFlightSample = null;
      });

    await this.inFlightSample;
  }

  /** Samples once if no source has been recorded yet; otherwise returns immediately. */
  private async ensureSampled(): Promise<void> {
    if (this.sourceMeta.size > 0) {
      return;
    }
    await this.forceSample();
  }

  /**
   * Queries all three source groups in parallel and records one sample per source.
   * A group whose query fails contributes no snapshots; sources that were seen before but are
   * missing from this pass are recorded as unavailable.
   */
  private async collectSample(): Promise<void> {
    const sampleTime = this.now();

    const [flynnResult, localBackendsResult, dockerDependenciesResult] = await Promise.allSettled([
      fetchSystemdUnitStatus(this.runner, {
        unit: this.flynnSystemdUnit,
        name: 'Flynn daemon',
        user: false,
      }),
      listLocalBackendStatuses(this.config, async (args: string[]) => {
        return this.runner('systemctl', args, {
          timeoutMs: DEFAULT_TIMEOUT_MS,
          maxBufferBytes: 1024 * 1024,
        });
      }),
      listDockerDependencyStatuses(this.config, async (args: string[]) => {
        return this.runner('docker', ['compose', '-f', COMPOSE_FILE, ...args], {
          timeoutMs: DEFAULT_TIMEOUT_MS,
          maxBufferBytes: LARGE_BUFFER_BYTES,
        });
      }),
    ]);

    const flynnStatus: SystemdStatus = flynnResult.status === 'fulfilled'
      ? flynnResult.value
      : {
        unit: this.flynnSystemdUnit,
        name: 'Flynn daemon',
        loadState: 'unknown',
        activeState: 'unknown',
        subState: 'unknown',
        statusText: 'unavailable',
        pid: null,
        result: 'unknown',
        error: normalizeError(flynnResult.reason),
      };
    const localBackends = localBackendsResult.status === 'fulfilled'
      ? localBackendsResult.value
      : [];
    const dockerDependencies = dockerDependenciesResult.status === 'fulfilled'
      ? dockerDependenciesResult.value
      : [];

    const snapshots: SourceSnapshot[] = [];

    const flynnMapped = mapSystemdStatus(flynnStatus.activeState, flynnStatus.error);
    snapshots.push({
      source: {
        id: 'systemd:flynn',
        name: 'Flynn daemon',
        kind: 'systemd_system',
        runtime: 'systemd_system',
        status: flynnMapped.status,
        graphCapable: true,
        logCapable: true,
        metadata: {
          unit: this.flynnSystemdUnit,
          state: flynnStatus.activeState,
          statusText: flynnStatus.statusText,
        },
      },
      stateCode: flynnMapped.stateCode,
      healthCode: flynnMapped.healthCode,
      // A failed unit counts as an error, exactly as it does for the local backends below.
      hasError: Boolean(flynnStatus.error) || flynnStatus.activeState === 'failed',
      fingerprint: flynnStatus.pid ? `pid:${flynnStatus.pid}` : null,
    });

    for (const backend of localBackends) {
      const mapped = mapSystemdStatus(String(backend.activeState ?? ''), backend.error);
      snapshots.push({
        source: {
          id: `systemd-user:${backend.id}`,
          name: backend.name,
          kind: 'systemd_user',
          runtime: 'systemd_user',
          status: mapped.status,
          graphCapable: true,
          logCapable: backend.loadState !== 'not-found',
          metadata: {
            unit: backend.unit,
            state: backend.activeState,
            statusText: backend.statusText,
          },
        },
        stateCode: mapped.stateCode,
        healthCode: mapped.healthCode,
        hasError: Boolean(backend.error) || backend.activeState === 'failed',
        fingerprint: backend.pid ? `pid:${backend.pid}` : null,
      });
    }

    for (const dependency of dockerDependencies) {
      const mapped = mapDockerStatus(dependency.state, dependency.health, dependency.error);
      snapshots.push({
        source: {
          id: `docker:${dependency.id}`,
          name: dependency.name,
          kind: 'docker_dependency',
          runtime: 'docker_compose',
          status: mapped.status,
          graphCapable: true,
          // The entry with id `compose` gets no logs (presumably it stands for the compose stack itself).
          logCapable: dependency.id !== 'compose',
          metadata: {
            service: dependency.service,
            state: dependency.state,
            health: dependency.health,
            statusText: dependency.statusText,
            containerName: dependency.containerName,
          },
        },
        stateCode: mapped.stateCode,
        healthCode: mapped.healthCode,
        hasError: Boolean(dependency.error) || mapped.status === 'degraded' || mapped.status === 'unavailable',
        fingerprint: dependency.containerName ? `container:${dependency.containerName}` : null,
      });
    }

    const seenSourceIds = new Set<string>();
    for (const snapshot of snapshots) {
      seenSourceIds.add(snapshot.source.id);
      this.recordSourceSample(snapshot, sampleTime);
    }

    // Keep stale entries visible as unavailable when they disappear.
    for (const [sourceId, source] of this.sourceMeta.entries()) {
      if (seenSourceIds.has(sourceId)) {
        continue;
      }
      this.recordSourceSample({
        source: {
          ...source,
          status: 'unavailable',
        },
        stateCode: STATE_UNAVAILABLE,
        healthCode: HEALTH_UNKNOWN,
        hasError: true,
        fingerprint: null,
      }, sampleTime);
    }
  }

  /**
   * Stores one observation: updates the source description and counters, then appends a history
   * record, trimming the oldest records beyond `maxSamplesPerSource`.
   *
   * A restart is counted when the source enters the running state, or when it is still running
   * but its fingerprint differs from the previous one. The very first sample of a source never
   * counts as a restart.
   *
   * @param snapshot - The observation to record.
   * @param timestamp - Sample time in epoch milliseconds.
   */
  private recordSourceSample(snapshot: SourceSnapshot, timestamp: number): void {
    const sourceId = snapshot.source.id;
    this.sourceMeta.set(sourceId, snapshot.source);

    const counter: SourceCounter = this.sourceCounters.get(sourceId) ?? {
      errorCount: 0,
      restartCount: 0,
      lastStateCode: snapshot.stateCode,
      lastFingerprint: snapshot.fingerprint,
      hasPrevious: false,
    };

    if (snapshot.hasError) {
      counter.errorCount += 1;
    }

    if (counter.hasPrevious) {
      const enteredRunning = counter.lastStateCode !== STATE_RUNNING && snapshot.stateCode === STATE_RUNNING;
      const fingerprintChanged = (
        snapshot.stateCode === STATE_RUNNING
        && Boolean(snapshot.fingerprint)
        && Boolean(counter.lastFingerprint)
        && snapshot.fingerprint !== counter.lastFingerprint
      );
      if (enteredRunning || fingerprintChanged) {
        counter.restartCount += 1;
      }
    }

    counter.lastStateCode = snapshot.stateCode;
    counter.lastFingerprint = snapshot.fingerprint;
    counter.hasPrevious = true;
    this.sourceCounters.set(sourceId, counter);

    const records = this.sourceHistory.get(sourceId) ?? [];
    records.push({
      ts: timestamp,
      stateCode: snapshot.stateCode,
      healthCode: snapshot.healthCode,
      errorCount: counter.errorCount,
      restartCount: counter.restartCount,
    });

    if (records.length > this.maxSamplesPerSource) {
      records.splice(0, records.length - this.maxSamplesPerSource);
    }

    this.sourceHistory.set(sourceId, records);
  }

  /**
   * Fetches logs of a docker compose service via `docker compose logs`.
   * All profiles reported by compose are enabled for the call.
   *
   * @throws Error when the source has no `service` metadata or the command fails.
   */
  private async fetchDockerLogs(
    source: ObservabilitySource,
    lines: number,
    sinceSeconds: number,
  ): Promise<ServiceLogSnapshot> {
    const service = source.metadata?.service;
    if (!service) {
      throw new Error(`Missing docker compose service metadata for source: ${source.id}`);
    }

    const profileArgs = await discoverComposeProfileArgs(this.runner);
    const response = await this.runner(
      'docker',
      [
        'compose',
        '-f',
        COMPOSE_FILE,
        ...profileArgs,
        'logs',
        '--timestamps',
        '--tail',
        String(lines),
        '--since',
        `${sinceSeconds}s`,
        service,
      ],
      { timeoutMs: LARGE_TIMEOUT_MS, maxBufferBytes: LARGE_BUFFER_BYTES },
    );

    return this.buildLogSnapshot(source.id, response.stdout, lines, splitDockerLogLine);
  }

  /**
   * Fetches logs of a systemd unit via `journalctl`, using `--user` for user-scope units.
   *
   * @throws Error when the source has no `unit` metadata or the command fails.
   */
  private async fetchJournalLogs(
    source: ObservabilitySource,
    lines: number,
    sinceSeconds: number,
  ): Promise<ServiceLogSnapshot> {
    const unit = source.metadata?.unit;
    if (!unit) {
      throw new Error(`Missing systemd unit metadata for source: ${source.id}`);
    }

    const user = source.runtime === 'systemd_user';
    const response = await this.runner(
      'journalctl',
      [
        ...(user ? ['--user'] : []),
        '-u',
        unit,
        '--since',
        `${sinceSeconds} seconds ago`,
        '--no-pager',
        '--output',
        'short-iso-precise',
        '-n',
        String(lines),
      ],
      {
        timeoutMs: LARGE_TIMEOUT_MS,
        maxBufferBytes: LARGE_BUFFER_BYTES,
      },
    );

    return this.buildLogSnapshot(source.id, response.stdout, lines, splitJournalLine);
  }

  /**
   * Turns raw command output into a log snapshot: drops blank lines, parses each remaining line
   * with `parseLine`, and flags the snapshot as redacted when any line was.
   *
   * @param sourceId - Id of the source the output belongs to.
   * @param stdout - Raw command output.
   * @param lines - Requested line limit; reaching it marks the snapshot as truncated.
   * @param parseLine - Runtime-specific line parser.
   */
  private buildLogSnapshot(
    sourceId: string,
    stdout: string,
    lines: number,
    parseLine: (line: string) => { ts?: number; level: 'info' | 'warn' | 'error'; text: string; redacted: boolean },
  ): ServiceLogSnapshot {
    let redacted = false;
    const entries: ServiceLogEntry[] = [];

    for (const rawLine of stdout.split('\n')) {
      const line = rawLine.trimEnd();
      if (line.trim().length === 0) {
        continue;
      }
      const item = parseLine(line);
      redacted = redacted || item.redacted;
      entries.push({ ts: item.ts, level: item.level, text: item.text });
    }

    return {
      sourceId,
      fetchedAt: this.now(),
      redacted,
      lines: entries,
      // Heuristic: a full page suggests older lines were cut off by the line limit.
      truncated: entries.length >= lines,
    };
  }
}
|
||||
|
||||
/**
 * Read-only view of the query defaults and limits used by this module, exported for callers
 * that need the same numbers (the bucket list is a copy of the internal tuple).
 */
export const observabilityDefaults = {
  // Series queries.
  DEFAULT_WINDOW_MINUTES,
  MAX_WINDOW_MINUTES,
  DEFAULT_BUCKET_SECONDS,
  ALLOWED_BUCKET_SECONDS: [...ALLOWED_BUCKET_SECONDS],
  // Log queries.
  DEFAULT_LOG_LINES,
  MAX_LOG_LINES,
  DEFAULT_LOG_SINCE_SECONDS,
  MAX_LOG_SINCE_SECONDS,
} as const;
|
||||
Reference in New Issue
Block a user