maestro/src/engine/backend-status-registry.test.ts
2026-06-03 05:08:00 +00:00

434 lines
16 KiB
TypeScript

import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import {
createBackendStatusRegistry,
type NodeStatus,
type ProbeContext,
} from './backend-status-registry.js';
import type { WorkerDef } from '../config.js';
/**
 * Builds a complete `NodeStatus` fixture for the tests in this file.
 *
 * The baseline describes an online, non-busy node with one slot, no loaded
 * model, no throughput figure and a fixed `lastSeen` timestamp. Any field
 * present in `partial` overrides the corresponding baseline value.
 *
 * @param partial - The identity fields (`nodeId`, `workerId`, `source`) plus
 *   any optional overrides.
 * @returns The baseline merged with `partial`; `partial` wins on conflicts.
 */
function makeStatus(partial: Partial<NodeStatus> & { nodeId: string; workerId: string; source: 'direct' | 'proxy' }): NodeStatus {
  const baseline = {
    online: true,
    busy: false,
    busySlots: 0,
    totalSlots: 1,
    loadedModel: null,
    throughputTps: null,
    lastSeen: '2026-05-18T00:00:00.000Z',
  };
  return { ...baseline, ...partial };
}
/**
 * Creates a clock stub for the registry's `now` option.
 *
 * @returns A function that returns the same ISO-8601 timestamp string on
 *   every call.
 */
function fixedClock(): () => string {
  const frozenInstant = '2026-05-18T00:00:00.000Z';
  return () => frozenInstant;
}
describe('createBackendStatusRegistry', () => {
// Every test starts with Vitest's fake timers installed; tests that need
// real scheduling switch back themselves by calling vi.useRealTimers().
beforeEach(() => {
vi.useFakeTimers();
});
// Always restore real timers after a test, whichever mode it finished in.
afterEach(() => {
vi.useRealTimers();
});
// Happy path: one direct worker is probed and its status is visible via getAll().
it('probes direct workers on start and exposes the snapshot via getAll', async () => {
  const workers: WorkerDef[] = [{ id: 'w1', endpoint: 'http://w1', model: 'qwen' }];
  const probedStatus = makeStatus({
    nodeId: 'w1',
    workerId: 'w1',
    source: 'direct',
    loadedModel: 'qwen',
  });
  const probeDirect = vi.fn().mockResolvedValue(probedStatus);
  const registry = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: 60_000,
    now: fixedClock(),
  });

  registry.start();
  await registry.refresh();

  const snapshot = registry.getAll();
  expect(snapshot).toHaveLength(1);
  const first = snapshot[0]!;
  expect(first.nodeId).toBe('w1');
  expect(first.loadedModel).toBe('qwen');

  await registry.stop();
});
// A single proxy worker may report several backends; each one must appear
// in the snapshot and stay attributed to the proxy worker's id.
it('expands proxy workers into multiple backends', async () => {
  const workers: WorkerDef[] = [{ id: 'pool', endpoint: 'http://litellm', proxy: true }];
  const backends = [
    makeStatus({ nodeId: 'gpu-a', workerId: 'pool', source: 'proxy', loadedModel: 'qwen3:8b' }),
    makeStatus({ nodeId: 'gpu-b', workerId: 'pool', source: 'proxy', loadedModel: 'qwen3:32b' }),
  ];
  const probeProxy = vi.fn().mockResolvedValue(backends);
  const registry = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect: vi.fn(),
    probeProxy,
    pollIntervalMs: 60_000,
    now: fixedClock(),
  });

  registry.start();
  await registry.refresh();

  const snapshot = registry.getAll();
  const sortedNodeIds = snapshot.map(status => status.nodeId).sort();
  expect(sortedNodeIds).toEqual(['gpu-a', 'gpu-b']);
  const allOwnedByPool = snapshot.every(status => status.workerId === 'pool');
  expect(allOwnedByPool).toBe(true);

  await registry.stop();
});
// A probe that throws must produce an offline entry carrying the error
// message, without disturbing the entry of a worker whose probe succeeded.
it('isolates probe failures: one node failing does not affect others', async () => {
  const workers: WorkerDef[] = [
    { id: 'w1', endpoint: 'http://w1' },
    { id: 'w2', endpoint: 'http://w2' },
  ];
  // w1 always fails; every other worker reports a healthy w2 status.
  const probeDirect = vi.fn().mockImplementation(async (worker: WorkerDef) => {
    if (worker.id !== 'w1') {
      return makeStatus({ nodeId: 'w2', workerId: 'w2', source: 'direct' });
    }
    throw new Error('boom');
  });
  const registry = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: 60_000,
    now: fixedClock(),
  });

  registry.start();
  await registry.refresh();

  const snapshot = registry.getAll();
  expect(snapshot).toHaveLength(2);
  const failed = snapshot.find(status => status.nodeId === 'w1')!;
  const healthy = snapshot.find(status => status.nodeId === 'w2')!;
  expect(failed.online).toBe(false);
  expect(failed.lastProbeError).toBe('boom');
  expect(healthy.online).toBe(true);
  expect(healthy.lastProbeError).toBeUndefined();

  await registry.stop();
});
// subscribe() contract: the listener receives the current snapshot right
// away, receives another after each refresh, and nothing after unsubscribing.
it('subscribe() delivers current snapshot synchronously and on each tick', async () => {
  const workers: WorkerDef[] = [{ id: 'w1', endpoint: 'http://w1' }];
  // Each probe reports its own ordinal as busySlots so that successive
  // deliveries can be told apart.
  let probeCalls = 0;
  const probeDirect = vi.fn().mockImplementation(async () => {
    probeCalls += 1;
    return makeStatus({ nodeId: 'w1', workerId: 'w1', source: 'direct', busySlots: probeCalls });
  });
  const registry = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: 60_000,
    now: fixedClock(),
  });

  registry.start();
  await registry.refresh();

  const delivered: number[] = [];
  const unsubscribe = registry.subscribe(snapshot => {
    const first = snapshot[0];
    delivered.push(first?.busySlots ?? -1);
  });
  // The listener was invoked before subscribe() returned.
  expect(delivered).toEqual([1]);

  await registry.refresh();
  expect(delivered).toEqual([1, 2]);

  unsubscribe();
  await registry.refresh();
  // Unsubscribed listeners receive nothing further.
  expect(delivered).toEqual([1, 2]);

  await registry.stop();
});
// Overlap guard: refresh() calls issued while a probe cycle is still pending
// must join that cycle instead of starting another one, so the probe mock is
// invoked exactly once.
it('skips overlapping ticks rather than stacking', async () => {
  const workers: WorkerDef[] = [{ id: 'w1', endpoint: 'http://w1' }];
  // Declared without an initializer on purpose. With `= null`, TypeScript
  // narrows the variable to `null` at this point and does not track the
  // assignment made inside the closure below, so `resolveFirst!()` would be
  // typed as a call on `never` and fail type-checking.
  let resolveFirst: (() => void) | undefined;
  // The probe stays pending until the test releases it via resolveFirst.
  const probeDirect = vi.fn().mockImplementation(() => new Promise<NodeStatus>(resolve => {
    resolveFirst = () => resolve(makeStatus({ nodeId: 'w1', workerId: 'w1', source: 'direct' }));
  }));
  const reg = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: 1000,
    now: fixedClock(),
  });
  reg.start();
  // Two parallel refresh calls should share the same inflight
  const a = reg.refresh();
  const b = reg.refresh();
  // Release the pending probe so both refresh promises can settle.
  resolveFirst!();
  await Promise.all([a, b]);
  expect(probeDirect).toHaveBeenCalledTimes(1);
  await reg.stop();
});
// With six workers and maxConcurrency: 2, no more than two probes may be
// running at once, and all six workers must still end up in the snapshot.
it('respects maxConcurrency when probing many workers', async () => {
  // Real timers: each probe waits on a genuine 5ms setTimeout.
  vi.useRealTimers();
  const workers: WorkerDef[] = [];
  for (let index = 0; index < 6; index++) {
    workers.push({ id: `w${index}`, endpoint: `http://w${index}` });
  }
  // Track how many probes are running right now and the highest value seen.
  let inFlight = 0;
  let peakInFlight = 0;
  const probeDirect = vi.fn().mockImplementation(async (worker: WorkerDef) => {
    inFlight += 1;
    if (inFlight > peakInFlight) {
      peakInFlight = inFlight;
    }
    await new Promise(done => setTimeout(done, 5));
    inFlight -= 1;
    return makeStatus({ nodeId: worker.id, workerId: worker.id, source: 'direct' });
  });
  const registry = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: 60_000,
    maxConcurrency: 2,
    now: fixedClock(),
  });

  registry.start();
  await registry.refresh();

  expect(peakInFlight).toBeLessThanOrEqual(2);
  expect(registry.getAll()).toHaveLength(6);

  await registry.stop();
});
// Shutdown regression: stop() must abort a probe that is still pending and
// resolve quickly, instead of waiting for the probe to settle on its own.
it('stop() aborts in-flight probes and resolves promptly (no shutdown hang)', async () => {
// Real timers: the test yields via setImmediate and measures wall-clock
// time with Date.now().
vi.useRealTimers();
const workers: WorkerDef[] = [{ id: 'w1', endpoint: 'http://w1' }];
// Probe that never resolves and only settles (by rejecting) when its
// external AbortSignal fires — simulates an upstream that would otherwise
// wedge until per-probe timeout (3s in prod).
let aborts = 0;
const probeDirect = vi.fn().mockImplementation((_w: WorkerDef, ctx?: ProbeContext) =>
new Promise<NodeStatus>((_resolve, reject) => {
const sig = ctx?.signal;
// Without a signal this promise could never settle, so fail fast with a
// descriptive error instead of hanging the test.
if (!sig) {
reject(new Error('test expected a signal'));
return;
}
// Count abort notifications so the test can assert the probe was aborted
// exactly once; `once: true` removes the listener after it fires.
sig.addEventListener('abort', () => {
aborts++;
reject(new Error('aborted'));
}, { once: true });
}));
const reg = createBackendStatusRegistry({
getWorkers: () => workers,
probeDirect,
probeProxy: vi.fn(),
pollIntervalMs: 60_000,
now: () => '2026-05-18T00:00:00.000Z',
});
reg.start();
// Don't await refresh — refresh() resolves only after the probe
// settles, and we want to confirm stop() drives that settlement.
// The catch() keeps a rejection from surfacing as an unhandled rejection.
const refreshPromise = reg.refresh().catch(() => { /* expected */ });
// Yield to the event loop once (setImmediate callback, which runs after
// pending microtasks) so the probe has been invoked and has attached its
// abort listener before stop() is called.
await new Promise(r => setImmediate(r));
const before = Date.now();
await reg.stop();
const elapsed = Date.now() - before;
// stop() should not have waited the full per-probe timeout (3s in
// prod, but the test uses no timeout cap — without abort it would
// hang forever). 500ms is a generous upper bound.
expect(elapsed).toBeLessThan(500);
expect(aborts).toBe(1);
// The refresh started above must also settle once the probe was aborted.
await refreshPromise;
});
it('refresh() shares the inflight cycle when called during the start() initial probe (race regression)', async () => {
  // Regression for the "scheduleNext vs refresh inflight" race noted
  // in PR #318 review. start() kicks off an immediate probe and
  // assigns it to `inflight`; a refresh() call landing before that
  // probe settles must reuse the same inflight promise rather than
  // spawning a parallel runOnce — otherwise two probe cycles race to
  // write `cache` and notify subscribers.
  vi.useRealTimers();
  const workers: WorkerDef[] = [{ id: 'w1', endpoint: 'http://w1' }];
  // Declared without an initializer on purpose. With `= null`, TypeScript
  // narrows the variable to `null` at this point and does not track the
  // assignment made inside the closure below, so `resolveProbe!(...)` would
  // be typed as a call on `never` and fail type-checking.
  let resolveProbe: ((s: NodeStatus) => void) | undefined;
  const probeDirect = vi.fn().mockImplementation(() =>
    new Promise<NodeStatus>(resolve => {
      // Capture only the FIRST probe's resolver. If refresh() spawned
      // a second runOnce, this mock would be invoked twice and the
      // captured resolver would point at the second invocation,
      // leaving the first cycle hanging — the test would time out.
      if (!resolveProbe) {
        resolveProbe = (s) => resolve(s);
      } else {
        // A duplicate invocation indicates the race fired; resolve
        // with a marker so the assertion below catches it instead of
        // hanging.
        resolve(makeStatus({ nodeId: 'DUPLICATE', workerId: 'w1', source: 'direct' }));
      }
    }));
  const reg = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: 60_000,
    now: () => '2026-05-18T00:00:00.000Z',
  });
  reg.start();
  // refresh() lands while the start()-initiated probe is still in flight.
  const refreshPromise = reg.refresh();
  // Let the runtime schedule both call sites.
  await new Promise(r => setImmediate(r));
  // Exactly one probe must have been issued: the initial start() one,
  // shared by refresh().
  expect(probeDirect).toHaveBeenCalledTimes(1);
  // Release the single pending probe; the shared cycle can now complete.
  resolveProbe!(makeStatus({ nodeId: 'w1', workerId: 'w1', source: 'direct' }));
  await refreshPromise;
  // Still one call after settlement, and no DUPLICATE marker in the snapshot.
  expect(probeDirect).toHaveBeenCalledTimes(1);
  expect(reg.getAll().map(s => s.nodeId)).toEqual(['w1']);
  await reg.stop();
});
describe('dynamic polling cadence', () => {
// With a listener subscribed, a probe must fire once the short (active)
// poll interval has elapsed.
it('uses the active interval when at least one listener is subscribed', async () => {
  vi.useFakeTimers();
  const activeIntervalMs = 5_000;
  const workers: WorkerDef[] = [{ id: 'w1', endpoint: 'http://w1' }];
  const probeDirect = vi.fn().mockResolvedValue(makeStatus({ nodeId: 'w1', workerId: 'w1', source: 'direct' }));
  const registry = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: activeIntervalMs,
    idlePollIntervalMs: 60_000,
    subscriberActiveWindowMs: 30_000,
    now: fixedClock(),
    // Reads the faked Date, so it advances together with the fake timers.
    monotonicNowMs: () => Date.now(),
  });

  registry.start();
  // Flush the initial probe so the registry is waiting on its first
  // scheduled timer.
  await vi.advanceTimersByTimeAsync(0);
  await Promise.resolve();

  const unsubscribe = registry.subscribe(() => {});
  const callsBeforeTick = probeDirect.mock.calls.length;

  // Just past the active interval, at least one more probe must have run.
  await vi.advanceTimersByTimeAsync(activeIntervalMs + 1);
  await Promise.resolve();
  expect(probeDirect.mock.calls.length).toBeGreaterThan(callsBeforeTick);

  unsubscribe();
  await registry.stop();
});
// With no subscriber ever attached, probes must follow the long (idle)
// interval rather than the short active one.
it('falls back to the idle interval when no subscribers are active', async () => {
  vi.useFakeTimers();
  const activeIntervalMs = 5_000;
  const idleIntervalMs = 60_000;
  const workers: WorkerDef[] = [{ id: 'w1', endpoint: 'http://w1' }];
  const probeDirect = vi.fn().mockResolvedValue(makeStatus({ nodeId: 'w1', workerId: 'w1', source: 'direct' }));
  const registry = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: activeIntervalMs,
    idlePollIntervalMs: idleIntervalMs,
    subscriberActiveWindowMs: 30_000,
    now: fixedClock(),
    monotonicNowMs: () => Date.now(),
  });

  registry.start();
  await vi.advanceTimersByTimeAsync(0);
  await Promise.resolve();
  const baselineCalls = probeDirect.mock.calls.length;

  // 10s is twice the active interval but far short of the idle one, so
  // nothing may have fired yet.
  await vi.advanceTimersByTimeAsync(2 * activeIntervalMs);
  await Promise.resolve();
  expect(probeDirect.mock.calls.length).toBe(baselineCalls);

  // Moving past the idle interval produces exactly one additional probe.
  await vi.advanceTimersByTimeAsync(idleIntervalMs);
  await Promise.resolve();
  expect(probeDirect.mock.calls.length).toBe(baselineCalls + 1);

  await registry.stop();
});
// Calling noteSubscriberActivity() while the registry is polling at the
// idle cadence must bring the next probe forward to the active cadence.
it('noteSubscriberActivity() wakes the registry from idle to active cadence', async () => {
  vi.useFakeTimers();
  const activeIntervalMs = 5_000;
  const workers: WorkerDef[] = [{ id: 'w1', endpoint: 'http://w1' }];
  const probeDirect = vi.fn().mockResolvedValue(makeStatus({ nodeId: 'w1', workerId: 'w1', source: 'direct' }));
  const registry = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: activeIntervalMs,
    idlePollIntervalMs: 60_000,
    subscriberActiveWindowMs: 30_000,
    now: fixedClock(),
    monotonicNowMs: () => Date.now(),
  });

  registry.start();
  await vi.advanceTimersByTimeAsync(0);
  await Promise.resolve();
  const baselineCalls = probeDirect.mock.calls.length;

  // No subscribers yet: 6s (longer than the active interval) passes
  // without a probe, which confirms the idle cadence is in effect.
  await vi.advanceTimersByTimeAsync(6_000);
  await Promise.resolve();
  expect(probeDirect.mock.calls.length).toBe(baselineCalls);

  // After activity is noted, one probe must run within the active interval.
  registry.noteSubscriberActivity!();
  await vi.advanceTimersByTimeAsync(activeIntervalMs + 1);
  await Promise.resolve();
  expect(probeDirect.mock.calls.length).toBe(baselineCalls + 1);

  await registry.stop();
});
// Once the subscriber-activity window (10s here) has elapsed with no new
// activity, polling must drop from the active cadence back to the idle one.
it('falls back to idle cadence after the active window elapses without activity', async () => {
  vi.useFakeTimers();
  const activeIntervalMs = 5_000;
  const idleIntervalMs = 60_000;
  const workers: WorkerDef[] = [{ id: 'w1', endpoint: 'http://w1' }];
  const probeDirect = vi.fn().mockResolvedValue(makeStatus({ nodeId: 'w1', workerId: 'w1', source: 'direct' }));
  const registry = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: activeIntervalMs,
    idlePollIntervalMs: idleIntervalMs,
    subscriberActiveWindowMs: 10_000,
    now: fixedClock(),
    monotonicNowMs: () => Date.now(),
  });

  registry.start();
  await vi.advanceTimersByTimeAsync(0);
  await Promise.resolve();
  registry.noteSubscriberActivity!();

  // First active-cadence tick, roughly 5s in; the activity window is
  // still open.
  await vi.advanceTimersByTimeAsync(activeIntervalMs + 1);
  await Promise.resolve();
  const callsAfterFirstTick = probeDirect.mock.calls.length;

  // Second tick, roughly 10s in. It was scheduled while the window was
  // still open, so it still runs on the active cadence; the scheduling
  // decision made after it must switch to idle because the window has
  // closed by then.
  await vi.advanceTimersByTimeAsync(activeIntervalMs + 1);
  await Promise.resolve();
  const callsAfterSecondTick = probeDirect.mock.calls.length;
  expect(callsAfterSecondTick).toBe(callsAfterFirstTick + 1);

  // Another 10s (two active intervals) must pass without any probe,
  // showing the next tick is on the 60s idle band.
  await vi.advanceTimersByTimeAsync(10_000);
  await Promise.resolve();
  expect(probeDirect.mock.calls.length).toBe(callsAfterSecondTick);

  // Advancing by the idle interval yields exactly one more probe.
  await vi.advanceTimersByTimeAsync(idleIntervalMs);
  await Promise.resolve();
  expect(probeDirect.mock.calls.length).toBe(callsAfterSecondTick + 1);

  await registry.stop();
});
});
// Lookup by node id: a known id yields a non-null status, an unknown id yields null.
it('getByNodeId returns the matching status or null', async () => {
  const workers: WorkerDef[] = [{ id: 'w1', endpoint: 'http://w1' }];
  const probedStatus = makeStatus({ nodeId: 'w1', workerId: 'w1', source: 'direct' });
  const probeDirect = vi.fn().mockResolvedValue(probedStatus);
  const registry = createBackendStatusRegistry({
    getWorkers: () => workers,
    probeDirect,
    probeProxy: vi.fn(),
    pollIntervalMs: 60_000,
    now: fixedClock(),
  });

  registry.start();
  await registry.refresh();

  const known = registry.getByNodeId('w1');
  const unknown = registry.getByNodeId('does-not-exist');
  expect(known).not.toBeNull();
  expect(unknown).toBeNull();

  await registry.stop();
});
});