maestro/src/worker/sticky-backend.ts
oss-sync 25c087067a
Some checks failed
CI / build-and-test (push) Has been cancelled
sync: update from private repo (c8b6d29)
2026-06-10 09:09:39 +00:00

83 lines
3.2 KiB
TypeScript

/**
* Backend tracker for proxy worker jobs.
*
* History: originally "first backend wins" (2026-05-18 design, Open Question
* #3 case 1) to keep the UI pet from flickering while the gateway rebalanced
* every request. As of 2026-06 the gateway honors `x-aao-preferred-backend`
* (client-side sticky routing for KV-cache reuse), so backend switches are
* RARE — they only happen when the preferred backend goes offline or
* saturates. The tracker therefore now follows the CURRENT backend:
*
* - `jobs.last_backend_id` is updated whenever the resolved backend CHANGES,
* so the UI (pet, badges) tracks where the job actually runs.
* - Persistence happens via `updateJob({ lastBackendId })`. If that DB write
* FAILS, the in-memory value is left unchanged so the next
* `onBackendResolved` event retries the persist (a transient DB error must
* not permanently lose the worker → backend mapping).
*
* This module isolates the "advance only after persist succeeds" invariant
* from `Worker.buildPieceCallbacks`, which already has a dozen other
* concerns and is hard to unit-test in isolation.
*/
/**
* Logging surface the backend tracker depends on. Any logger that exposes
* these three string-taking methods can be supplied.
*/
export interface StickyBackendLogger {
/** Part of the required shape; `createStickyBackendResolver` as written does not call it. */
debug: (msg: string) => void;
/** Receives one line each time a backend change has been persisted. */
info: (msg: string) => void;
/** Receives one line each time persisting a backend change fails. */
warn: (msg: string) => void;
}
/**
* Payload handed to the tracker's `onEvent` each time a backend is resolved
* for a request.
*/
export interface StickyBackendEvent {
/** Id of the resolved backend; compared against the tracked value and passed to `persist` when it differs. */
backendId: string;
/** Only used in the tracker's log line; `null` is logged as `miss`. */
cacheKey: string | null;
}
/**
* Handle returned by `createStickyBackendResolver`: an event sink plus a
* getter for the backend id currently tracked in memory.
*/
export interface BackendTracker {
/** The onBackendResolved callback for the agent loop (fire-and-forget safe). */
onEvent: (event: StickyBackendEvent) => Promise<void>;
/**
* The most recently persisted backend id (or the initial DB value).
* Used as the `x-aao-preferred-backend` hint on the next LLM request.
*/
current: () => string | null;
}
/**
 * Build the backend tracker for one job.
 *
 * The tracker holds an in-memory backend id that starts at `initial` and is
 * advanced only AFTER `persist` resolves for a new id. If `persist` rejects,
 * the in-memory value is left untouched so the next event carrying the same
 * backend id retries the write instead of short-circuiting on equality.
 *
 * @param opts.initial - Value returned by `current()` until the first
 *   successful persist (typically the job's stored `last_backend_id`).
 * @param opts.persist - The DB write (typically
 *   `repo.updateJob(jobId, { lastBackendId })`); it must reject on failure so
 *   the in-memory value stays put for retry.
 * @param opts.logger - Receives an `info` line per persisted change and a
 *   `warn` line per failed persist.
 * @param opts.workerId - Used only to prefix log lines.
 * @param opts.jobId - Used only in log lines.
 * @returns A tracker whose `onEvent` catches and logs persist failures rather
 *   than rethrowing them, and whose `current()` returns the tracked id.
 *
 * NOTE(review): overlapping `onEvent` calls are not serialized here; if the
 * caller can fire a second event while a persist is still in flight, the
 * final in-memory value follows persist completion order — confirm callers
 * do not rely on stricter ordering.
 */
export function createStickyBackendResolver(opts: {
  initial: string | null;
  persist: (backendId: string) => Promise<void>;
  logger: StickyBackendLogger;
  workerId: string;
  jobId: string;
}): BackendTracker {
  const { initial, persist, logger, workerId, jobId } = opts;
  let current: string | null = initial;

  async function onEvent({ backendId, cacheKey }: StickyBackendEvent): Promise<void> {
    if (current === backendId) return; // unchanged — nothing to persist
    try {
      await persist(backendId);
      // Only advance AFTER persist succeeds. If we advanced first and the
      // persist failed, the next identical event would short-circuit on the
      // equality check and the DB would stay stale forever.
      const previous = current;
      current = backendId;
      logger.info(
        previous
          ? // Separate old and new ids; without a separator they run together
            // (e.g. "b1b2") and the log line is unreadable.
            `[worker:${workerId}] job ${jobId} backend switched: ${previous} → ${backendId} cache=${cacheKey ?? 'miss'}`
          : `[worker:${workerId}] job ${jobId} backend resolved: ${backendId} cache=${cacheKey ?? 'miss'}`,
      );
    } catch (err) {
      logger.warn(
        `[worker:${workerId}] failed to persist lastBackendId for job ${jobId}: ${err} — keeping ${current ?? 'unset'} for retry`,
      );
      // Intentionally do NOT advance. Next event retries.
    }
  }

  return { onEvent, current: () => current };
}