maestro/src/bridge/dashboard-workers.ts
2026-06-03 05:08:00 +00:00

122 lines
4.8 KiB
TypeScript

import type { Repository } from '../db/repository.js';
import type { WorkerDef } from '../config.js';
import type { BackendStatusRegistry, NodeStatus } from '../engine/backend-status-registry.js';
/**
 * A single backend entry listed under a proxy worker's row
 * (see `WorkerStatusRow.backends`).
 */
export interface WorkerStatusBackendRow {
  /** Deployment id taken from the upstream /health response; stable across probes. */
  id: string;

  /**
   * `'running'` when the registry reports at least one in-flight slot,
   * `'idle'` when it reports none.
   */
  state: 'idle' | 'running';

  /** How many slots the registry currently reports as in flight. */
  busySlots: number;

  /** Slot capacity of the backend; stays 0 until the registry has probed it. */
  totalSlots: number;

  /**
   * Outcome of the latest /health probe: `false` if it failed,
   * `null` if no probe has happened yet.
   */
  online: boolean | null;
}
/** One configured worker as presented in the dashboard's worker list. */
export interface WorkerStatusRow {
  id: string;
  name: string;
  roles: string[];
  state: 'idle' | 'running';

  /** Set for rows that stand for a `proxy: true` worker (LiteLLM / AAO Gateway). */
  proxy: boolean;

  /**
   * Slot pressure taken from the BackendStatusRegistry. Only direct
   * workers carry it, and only once the registry holds a row whose
   * `nodeId === worker.id` (i.e. at least one probe cycle has run).
   * Proxy workers never set these: a proxy has no /slots-style busy
   * figure of its own, so the useful numbers are the per-backend ones
   * in `backends[]`.
   */
  busySlots?: number;
  totalSlots?: number;

  /** Probe liveness; present under the same conditions as busySlots/totalSlots. */
  online?: boolean;

  /**
   * Backends behind a proxy worker, filled in when a
   * BackendStatusRegistry is wired up. Direct workers leave the field
   * undefined rather than empty, which lets the UI tell "direct
   * worker, so no backends apply" apart from "proxy worker for which
   * zero backends were reported".
   */
  backends?: WorkerStatusBackendRow[];
}
/**
 * Assemble one status row per configured worker for the Side Info Panel.
 *
 * When a `BackendStatusRegistry` is supplied, proxy workers are expanded
 * into a `backends[]` list so the Worker widget shows the same level of
 * detail as the Node Status widget: an operator sees *which* backend
 * behind a LiteLLM / AAO Gateway front is in use, not merely that "the
 * proxy is busy". Direct workers instead get their own slot figures
 * attached to the row.
 *
 * Privacy: rows carry only idle/running state, slot counts and probe
 * liveness. Job ids, titles and owners are never included, because the
 * panel is visible to every user of a multi-tenant deployment.
 *
 * NOTE(review): the result of `repo.isWorkerBusy` is consumed as a plain
 * boolean and nothing in this function awaits — confirm that method is
 * synchronous, otherwise every worker would read as `'running'`.
 *
 * @param repo - Source of the local busy flag via `isWorkerBusy(id)`.
 * @param workers - Worker definitions; one output row each, same order.
 * @param registry - Optional probe snapshot provider. With `null` (the
 *   default) rows contain only the base fields.
 * @returns The status rows, in the order of `workers`.
 */
export async function collectWorkerStatuses(
  repo: Repository,
  workers: WorkerDef[],
  registry: Pick<BackendStatusRegistry, 'getAll'> | null = null,
): Promise<WorkerStatusRow[]> {
  // Index the registry snapshot by worker id in a single pass, so each
  // worker below does a map lookup instead of rescanning the snapshot.
  // getAll() is invoked exactly once per call.
  const statusesByWorker = new Map<string, NodeStatus[]>();
  if (registry) {
    for (const status of registry.getAll()) {
      const bucket = statusesByWorker.get(status.workerId);
      if (bucket === undefined) {
        statusesByWorker.set(status.workerId, [status]);
      } else {
        bucket.push(status);
      }
    }
  }

  const describeWorker = (worker: WorkerDef): WorkerStatusRow => {
    const fronted = worker.proxy === true;
    const entry: WorkerStatusRow = {
      id: worker.id,
      name: worker.id,
      roles: worker.roles ?? [],
      state: repo.isWorkerBusy(worker.id) ? 'running' : 'idle',
      proxy: fronted,
    };

    // Without a registry there is nothing to enrich the row with.
    if (!registry) return entry;

    const statuses = statusesByWorker.get(worker.id) ?? [];

    if (fronted) {
      // The registry keeps a self-row for the proxy itself
      // (source='proxy', nodeId = workerId) next to the backend rows.
      // Skip it, or the proxy would be listed as its own child.
      const children: WorkerStatusBackendRow[] = [];
      for (const status of statuses) {
        if (status.nodeId === worker.id) continue;
        children.push({
          id: status.nodeId,
          state: status.busySlots > 0 ? 'running' : 'idle',
          busySlots: status.busySlots,
          totalSlots: status.totalSlots,
          online: status.online,
        });
      }
      entry.backends = children;
      return entry;
    }

    // Direct worker: the registry holds a single row keyed by the
    // worker id (source='direct', fed from llama-server /slots). Its
    // slot pressure goes straight onto the row so the UI can print
    // `(busy/total)` beside the state badge — the same signal proxy
    // backends expose, one level up since direct workers don't expand.
    const own = statuses.find((status) => status.nodeId === worker.id);
    if (own === undefined) return entry;

    entry.busySlots = own.busySlots;
    entry.totalSlots = own.totalSlots;
    entry.online = own.online;

    // The probe also observes in-flight requests that never passed
    // through the local jobs table (e.g. dispatched outside AAO),
    // which `repo.isWorkerBusy` cannot see — so let it promote the
    // state to running.
    if (own.busySlots > 0) entry.state = 'running';
    return entry;
  };

  return workers.map((worker) => describeWorker(worker));
}