diff --git a/config.yaml.example b/config.yaml.example index fcd4ffd..c247004 100644 --- a/config.yaml.example +++ b/config.yaml.example @@ -117,14 +117,20 @@ llm: # shutdown_graceful_sec: 30 # SIGTERM 後、in-flight SSE の drain 上限秒 # # backends: -# - id: gpu-a # `x-aao-backend-id` / `/v1/models` に出る ID +# # role ベース routing: worker はジョブの role (auto/fast/quality/reflection) +# # を routing key として送り、Gateway はその role を担う最も空いている +# # backend に振る。異なる model 名の GPU でも同じ role でまとめられる。 +# # roles 未指定の backend は全 role を担当 (アップグレード後も従来どおり動く)。 +# - id: gpu-a # `x-aao-backend-id` に出る ID # endpoint: http://gpu-a:11434/v1 -# model: qwen3:32b # 厳密一致 routing +# model: qwen3:32b # backend が積むモデル (role が無い時の fallback 一致キー) +# roles: [quality] # この backend は quality ティア専用 # max_slots: 2 # llama-server -np と合わせる # api_key: ${GPU_A_API_KEY} # backend が bearer 必須な場合のみ # - id: gpu-b # endpoint: http://gpu-b:11434/v1 -# model: qwen3:32b +# model: qwen3:8b # 別 GPU は別モデルでも OK +# roles: [fast, auto] # fast/auto ティアを担当 # max_slots: 2 # # # Bootstrap / Backup 専用 virtual_keys (新規発行は admin API 経由を推奨)。 diff --git a/src/gateway/bootstrap.ts b/src/gateway/bootstrap.ts index dc1d39e..91e05e2 100644 --- a/src/gateway/bootstrap.ts +++ b/src/gateway/bootstrap.ts @@ -30,6 +30,7 @@ import { runMigrations } from '../db/migrate.js'; import { readGatewayConfig, validateGatewayConfig, + lintGatewayConfig, type GatewayBackendConfig, type GatewayConfig, } from './config.js'; @@ -206,6 +207,11 @@ export async function start(opts: StartGatewayOptions = {}): Promise { const norm = normalizeGatewayConfig({ enabled: 1 as unknown as boolean }); expect(norm.enabled).toBe(false); }); + + // --- Role-based routing (2026-06-09) --- + + it('parses backends[].roles into a deduped string array', () => { + const norm = normalizeGatewayConfig({ + backends: [ + { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4, roles: ['quality', 'quality', 'auto'] }, + ], + }); + expect(norm.backends[0]!.roles).toEqual(['quality', 'auto']); + }); + + it('drops empty / non-string role entries', () => { + const norm = normalizeGatewayConfig({ + backends: [ + { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4, roles: ['fast', '', 7 as unknown as string, ' '] }, + ], + }); + expect(norm.backends[0]!.roles).toEqual(['fast']); + }); + + it('leaves roles undefined when not provided (migration safety = serves all)', () => { + const norm = normalizeGatewayConfig({ + backends: [{ id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4 }], + }); + expect(norm.backends[0]!.roles).toBeUndefined(); + }); +}); + +describe('backendServesRole', () => { + it('matches a declared role', () => { + expect(backendServesRole({ id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 1, roles: ['quality'] }, 'quality')).toBe(true); + expect(backendServesRole({ id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 1, roles: ['quality'] }, 'fast')).toBe(false); + }); + + it('serves every default execution role when roles is unspecified', () => { + const b = { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 1 }; + for (const r of DEFAULT_GATEWAY_BACKEND_ROLES) { + expect(backendServesRole(b, r)).toBe(true); + } + expect(backendServesRole(b, 'not-a-role')).toBe(false); + }); + + it('treats an empty roles array as unspecified (serves all)', () => { + const b = { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 1, roles: [] }; + expect(backendServesRole(b, 'auto')).toBe(true); + }); +}); + +describe('lintGatewayConfig', () => { + it('returns no warnings for a clean config', () => { + expect(lintGatewayConfig(makeValidConfig())).toEqual([]); + }); + + it('warns when a role name collides with another backend id', () => { + const c = makeValidConfig(); + c.backends = [ + { id: 'auto', endpoint: 'http://x/v1', model: 'm', maxSlots: 1 }, + { id: 'gpu-b', endpoint: 'http://y/v1', model: 'm', maxSlots: 1, roles: ['auto'] }, + ]; + const warns = lintGatewayConfig(c); + expect(warns.some(w => /auto/.test(w) && /collid/i.test(w))).toBe(true); + }); + + it('warns when a role name collides with a backend model', () => { + const c = makeValidConfig(); + c.backends = [ + { id: 'gpu-a', endpoint: 'http://x/v1', model: 'fast', maxSlots: 1 }, + { id: 'gpu-b', endpoint: 'http://y/v1', model: 'm', maxSlots: 1, roles: ['fast'] }, + ]; + const warns = lintGatewayConfig(c); + expect(warns.some(w => /fast/.test(w) && /collid/i.test(w))).toBe(true); + }); }); describe('validateGatewayConfig', () => { diff --git a/src/gateway/config.ts b/src/gateway/config.ts index 8ab99e3..aa7579c 100644 --- a/src/gateway/config.ts +++ b/src/gateway/config.ts @@ -18,14 +18,45 @@ export interface GatewayBackendConfig { id: string; /** Upstream base URL — `/v1` suffix required so /chat/completions resolves cleanly. */ endpoint: string; - /** Model name this backend serves. The gateway routes by strict equality on `request.model`. */ + /** Model name this backend serves. Used as the backward-compat routing key (strict `request.model` equality). */ model: string; /** Concurrent slots the upstream can serve. Mirrors llama-server's `-np`. */ maxSlots: number; + /** + * Performance tiers (Roles) this backend serves: any of `auto`/`fast`/ + * `quality`/`reflection`. A worker sends the job's required role as the + * routing key; the gateway routes role → least-busy backend declaring it. + * This lets a heterogeneous GPU pool (different model names per box) be + * grouped by tier rather than by exact model name. + * + * Undefined / empty = serves every default execution role + * (DEFAULT_GATEWAY_BACKEND_ROLES) — keeps a pre-roles single-tier config + * working unchanged after upgrade. + */ + roles?: string[]; /** Optional bearer to forward to the backend (Ollama is usually unauthed). */ apiKey?: string; } +/** + * Roles an `roles`-less backend is assumed to serve. Covers every routing + * key a worker actually emits (job roles auto/fast/quality/reflection; + * title is mapped to `auto` worker-side). Tier separation is opt-in: it + * only takes effect once a backend declares an explicit `roles` subset. + */ +export const DEFAULT_GATEWAY_BACKEND_ROLES: readonly string[] = ['auto', 'fast', 'quality', 'reflection']; + +/** + * Does `backend` serve `role`? A backend with no explicit `roles` serves + * the full DEFAULT_GATEWAY_BACKEND_ROLES set (migration safety). The + * router uses this for role-match candidate selection; falls back to + * strict id/model matching only when NO backend serves the key. + */ +export function backendServesRole(backend: GatewayBackendConfig, role: string): boolean { + const roles = backend.roles && backend.roles.length > 0 ? backend.roles : DEFAULT_GATEWAY_BACKEND_ROLES; + return roles.includes(role); +} + export interface GatewayVirtualKey { /** Bearer string a client sends in `Authorization: Bearer `. */ key: string; @@ -190,11 +221,22 @@ function normalizeBackend(b: GatewayBackendConfig): GatewayBackendConfig { typeof ms === 'number' && Number.isFinite(ms) && ms > 0 ? Math.floor(ms) : NaN; + // Roles: keep undefined (= serves all) when absent, else dedupe and drop + // empty / non-string entries so a typo'd YAML list can't smuggle a blank + // role that matches nothing. + let roles: string[] | undefined; + if (Array.isArray(b.roles)) { + const cleaned = b.roles + .filter((r): r is string => typeof r === 'string' && r.trim().length > 0) + .map(r => r.trim()); + roles = [...new Set(cleaned)]; + } return { id: b.id, endpoint: b.endpoint, model: b.model, maxSlots, + ...(roles !== undefined ? { roles } : {}), apiKey: b.apiKey, }; } @@ -281,6 +323,11 @@ export function validateGatewayConfig(cfg: GatewayConfig): string[] { `gateway.backends[${b.id}].max_slots must be a positive integer (got ${b.maxSlots})`, ); } + if (b.roles !== undefined) { + if (!Array.isArray(b.roles) || b.roles.some(r => typeof r !== 'string' || r.trim() === '')) { + errors.push(`gateway.backends[${b.id}].roles must be an array of non-empty strings if defined`); + } + } } } @@ -351,6 +398,48 @@ export function validateGatewayConfig(cfg: GatewayConfig): string[] { return errors; } +/** + * Non-fatal lint pass — returns human-readable WARNINGS (empty = clean). + * Separate from validateGatewayConfig (which returns boot-fatal errors) + * because these conditions still route, just ambiguously. + * + * Currently flags role-name ↔ backend-id/model collisions: because the + * router resolves a routing key by role membership FIRST and only falls + * back to id/model exact match, a backend whose id or model equals a role + * another backend serves can never be reached by that literal name. The + * operator probably didn't intend the shadowing — warn, don't block. + */ +export function lintGatewayConfig(cfg: GatewayConfig): string[] { + const warnings: string[] = []; + if (!Array.isArray(cfg.backends)) return warnings; + + // Every role any backend declares (explicit or default-implied). + const declaredRoles = new Set(); + for (const b of cfg.backends) { + const roles = b.roles && b.roles.length > 0 ? b.roles : DEFAULT_GATEWAY_BACKEND_ROLES; + for (const r of roles) declaredRoles.add(r); + } + + for (const b of cfg.backends) { + // Only flag EXPLICIT id/model collisions — a default-roles backend + // (which serves every role) isn't a "name" the operator chose to + // shadow, so don't drown them in warnings. + if (declaredRoles.has(b.id)) { + warnings.push( + `gateway.backends[${b.id}]: backend id '${b.id}' collides with a role name served by the pool — ` + + `role routing shadows it, so '${b.id}' is unreachable as a literal model name`, + ); + } + if (b.model && b.model !== b.id && declaredRoles.has(b.model)) { + warnings.push( + `gateway.backends[${b.id}]: model '${b.model}' collides with a role name served by the pool — ` + + `role routing shadows it, so '${b.model}' is unreachable as a literal model name`, + ); + } + } + return warnings; +} + /** * Pull the (possibly missing) gateway block off AppConfig. Used by * gateway/bootstrap.ts so both worker and gateway modes can call diff --git a/src/gateway/models-endpoint.test.ts b/src/gateway/models-endpoint.test.ts index eda13e4..c172703 100644 --- a/src/gateway/models-endpoint.test.ts +++ b/src/gateway/models-endpoint.test.ts @@ -11,9 +11,13 @@ describe('buildModelsResponse', () => { expect(resp.data).toEqual([]); }); - it('maps each backend id to a model entry', () => { + it('maps each backend id to a model entry (id-shaped, not role-shaped)', () => { + // /v1/models stays backend-id-shaped: the Settings "physical backends" + // panel (config-api /api/workers/:id/backends) reads it to list the GPUs + // behind a proxy worker. Role-based routing lives in the router + worker, + // not in this discovery endpoint (2026-06-09 decision). const resp = buildModelsResponse([ - { id: 'gpu-a', endpoint: 'http://a/v1', model: 'qwen3:8b', maxSlots: 4 }, + { id: 'gpu-a', endpoint: 'http://a/v1', model: 'qwen3:8b', maxSlots: 4, roles: ['quality'] }, { id: 'gpu-b', endpoint: 'http://b/v1', model: 'qwen3:14b', maxSlots: 4 }, ]); expect(resp.data).toEqual([ diff --git a/src/gateway/models-endpoint.ts b/src/gateway/models-endpoint.ts index 89a0e1e..9abd7ac 100644 --- a/src/gateway/models-endpoint.ts +++ b/src/gateway/models-endpoint.ts @@ -4,7 +4,15 @@ * Returns one entry per `gateway.backends[]` with `id` set to the backend * id (which is also what `x-aao-backend-id` and `x-litellm-model-id` will * carry in responses). Clients hit this endpoint to discover which model - * names they may request. + * names they may request, and `/api/workers/:id/backends` (the Settings + * "physical backends" panel) reads it to list the GPUs behind a proxy + * worker — so it stays backend-id-shaped, NOT role-shaped. + * + * Role-based routing (2026-06-09) lives entirely in the router + the worker + * (which sends a role as the routing key). The over-concentration bug was + * fixed worker-side by dropping model auto-select in gateway mode, so this + * endpoint did not need to change. A client may still request a role: the + * router's role-match arm handles it even though roles aren't advertised here. * * Auth required: the response surfaces backend ids which could otherwise * be enumerated by an unauthenticated probe. diff --git a/src/gateway/router.test.ts b/src/gateway/router.test.ts index 5ad1430..9360e23 100644 --- a/src/gateway/router.test.ts +++ b/src/gateway/router.test.ts @@ -41,6 +41,10 @@ function bk(id: string, model: string, maxSlots = 4): GatewayBackendConfig { return { id, endpoint: `http://${id}/v1`, model, maxSlots }; } +function bkRoles(id: string, model: string, roles: string[], maxSlots = 4): GatewayBackendConfig { + return { id, endpoint: `http://${id}/v1`, model, maxSlots, roles }; +} + describe('createRouter.pick', () => { it('returns null when model has no configured backend', () => { const r = createRouter({ @@ -195,6 +199,95 @@ describe('createRouter.pick', () => { }); }); +describe('createRouter.pick — role-based routing (2026-06-09)', () => { + it('routes a role key only to backends that declare the role', () => { + // Heterogeneous pool: two GPUs serve the quality tier with DIFFERENT + // model names; one serves fast. A request for "quality" must never + // land on the fast-only backend even though it is the most idle. + const r = createRouter({ + getBackends: () => [ + bkRoles('q1', 'big-model-a.gguf', ['quality']), + bkRoles('q2', 'big-model-b.gguf', ['quality']), + bkRoles('f1', 'small-model.gguf', ['fast']), + ], + registry: makeRegistry([ + status('q1', true, 3, 4), // ratio 0.75 + status('q2', true, 1, 4), // ratio 0.25 (winner within the role) + status('f1', true, 0, 4), // idle but WRONG role + ]), + }); + expect(r.pick('quality')?.id).toBe('q2'); + }); + + it('returns null (503) when every backend serving the role is saturated — no spill to other tiers', () => { + const r = createRouter({ + getBackends: () => [ + bkRoles('q1', 'a.gguf', ['quality'], 2), + bkRoles('f1', 'b.gguf', ['fast'], 2), + ], + registry: makeRegistry([ + status('q1', true, 2, 2), // quality full + status('f1', true, 0, 2), // fast idle — must NOT be picked for quality + ]), + }); + expect(r.pick('quality')).toBeNull(); + }); + + it('enforces per-id max_slots independently within a role', () => { + const inflight = createBackendInflightCounter(); + const r = createRouter({ + getBackends: () => [ + bkRoles('q1', 'a.gguf', ['quality'], 1), + bkRoles('q2', 'b.gguf', ['quality'], 1), + ], + registry: makeRegistry([status('q1', true, 0, 1), status('q2', true, 0, 1)]), + inflight, + }); + const p1 = r.pick('quality')!.id; + const p2 = r.pick('quality')!.id; + expect(new Set([p1, p2])).toEqual(new Set(['q1', 'q2'])); // one each + expect(r.pick('quality')).toBeNull(); // both reserved + }); + + it('treats a backend with unspecified roles as serving every role', () => { + const r = createRouter({ + getBackends: () => [bk('generalist', 'm.gguf')], // no roles => all + registry: makeRegistry([status('generalist', true, 0, 4)]), + }); + expect(r.pick('auto')?.id).toBe('generalist'); + expect(r.pick('fast')?.id).toBe('generalist'); + expect(r.pick('quality')?.id).toBe('generalist'); + }); + + it('falls back to id/model exact match when NO backend declares the role (backward compat)', () => { + // A legacy caller sends a literal model name. No backend declares it + // as a role, so the router falls back to strict id/model matching. + const r = createRouter({ + getBackends: () => [bkRoles('q1', 'legacy-model.gguf', ['quality'])], + registry: makeRegistry([status('q1', true, 0, 4)]), + }); + expect(r.pick('legacy-model.gguf')?.id).toBe('q1'); // model-name fallback + expect(r.pick('q1')?.id).toBe('q1'); // id fallback + }); + + it('prefers role match over an id/model collision (role takes precedence)', () => { + // 'quality' is BOTH a declared role on roleBackend AND the id of a + // different backend. Role membership wins per decision 3. + const r = createRouter({ + getBackends: () => [ + bkRoles('roleBackend', 'm-a.gguf', ['quality']), + bkRoles('quality', 'm-b.gguf', ['fast']), // id literally "quality" but serves fast — the trap + ], + registry: makeRegistry([ + status('roleBackend', true, 0, 4), + status('quality', true, 0, 4), + ]), + }); + // Only the role-declaring backend is eligible; the id-collision one is ignored. + for (let i = 0; i < 5; i++) expect(r.pick('quality')?.id).toBe('roleBackend'); + }); +}); + describe('createRouter.pick — live reservation (per-host max_slots enforcement)', () => { it('reserves a slot on pick so a maxSlots=1 backend saturates after one pick', () => { // Root-cause regression: before the fix, pick() judged saturation diff --git a/src/gateway/router.ts b/src/gateway/router.ts index 73c625d..f73d1f2 100644 --- a/src/gateway/router.ts +++ b/src/gateway/router.ts @@ -2,16 +2,19 @@ * Least-busy backend router. * * Selection algorithm: - * 1. Filter to backends whose configured `id` OR `model` strictly - * matches `request.model`. Accepting both means a client can - * either name a specific deployment (`b.id`, mirrors LiteLLM's - * deployment-name convention used in our /v1/models + /health - * responses) or fall back to the generic model alias (`b.model`, - * for tools that pass through the raw model name configured on - * the worker side). Without the `b.id` arm, the gateway exposed - * ids via /v1/models that the worker would faithfully echo back - * in the chat request, and routing would 503 because nothing - * matched on b.model. Discovered during 2026-05-20 dogfooding. + * 1. Pick the candidate pool by ROLE first (2026-06-09). The routing + * key (`request.model`) is normally a performance tier a worker + * sends — auto/fast/quality/reflection. Backends declare which + * tiers they serve via `backends[].roles`; a backend with no + * `roles` serves them all. This groups a heterogeneous GPU pool + * (different model names per box) by tier rather than by exact + * model name. Membership is by config, not live load, so a fully + * saturated tier returns 503 instead of spilling to another tier. + * Only when NO backend serves the key do we fall back to the + * legacy strict `id` OR `model` match — keeping pre-roles configs + * and raw OpenAI clients (which name a deployment id/model alias, + * mirrored in /v1/models + /health) working. The id arm fixed a + * 503 found during 2026-05-20 dogfooding. * 2. Drop backends the registry says are offline or saturated * (`busy_slots >= total_slots`). * 3. Sort the survivors by `busy_slots / total_slots` ascending — @@ -27,7 +30,7 @@ * predictable in logs. The bias is at most O(slots) over the lifetime * of one process, which is fine for a single binary. */ -import type { GatewayBackendConfig } from './config.js'; +import { backendServesRole, type GatewayBackendConfig } from './config.js'; import type { BackendStatusRegistry, NodeStatus } from '../engine/backend-status-registry.js'; export interface RouterSnapshotBackend { @@ -122,7 +125,24 @@ export function createRouter(deps: CreateRouterDeps): Router { deps.registry.noteSubscriberActivity?.(); if (typeof model !== 'string' || model.length === 0) return null; - const backends = deps.getBackends(); + const allBackends = deps.getBackends(); + // Candidate pool selection (role-based, dual-match — 2026-06-09). + // + // The routing key (`model`) is normally a performance tier/role the + // worker sends (auto/fast/quality/reflection). Route to backends that + // DECLARE that role first; this groups a heterogeneous GPU pool by + // tier instead of by exact model name. Role membership is by config + // (backendServesRole), NOT by live availability — so if any backend + // serves the role, we stay inside that tier even when all of them are + // saturated (→ 503), rather than leaking the request to another tier. + // + // Only when NO backend serves the key do we fall back to the legacy + // strict id/model match. That keeps pre-roles configs and non-maestro + // OpenAI clients (which send a literal model name) working unchanged. + const roleMatched = allBackends.filter(b => backendServesRole(b, model)); + const backends = roleMatched.length > 0 + ? roleMatched + : allBackends.filter(b => b.id === model || b.model === model); // One getAll() per pick, not one per backend — the previous // `lookupStatus(b.id)` inside the loop did `registry.getAll()` // (which slices its internal cache) once per candidate. With N @@ -137,11 +157,9 @@ export function createRouter(deps: CreateRouterDeps): Router { const warm: RankedCandidate[] = []; const cold: RankedCandidate[] = []; for (const b of backends) { - // Match either the deployment id (LiteLLM-style routing key - // mirrored in /v1/models + /health) or the model alias the - // backend serves. See top-of-file docstring for why both arms - // are needed. - if (b.id !== model && b.model !== model) continue; + // `backends` is already filtered to the chosen candidate pool + // (role match, else id/model fallback) — no per-backend match + // check needed here. // Live reservation count for this backend (0 when no inflight // counter is wired). Used both to gate admission and to rank. const reserved = deps.inflight?.get(b.id) ?? 0; diff --git a/src/llm/routing-key.test.ts b/src/llm/routing-key.test.ts new file mode 100644 index 0000000..201f547 --- /dev/null +++ b/src/llm/routing-key.test.ts @@ -0,0 +1,124 @@ +/** + * Routing-key selection tests. + * + * The single decision shared by every LLM call site: gateway (proxy) mode + * routes by Role; direct mode sends the resolved model name. See + * docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md. + */ +import { describe, it, expect, vi } from 'vitest'; +import { llmRoutingKey, shouldRequeueForModelMismatch } from './routing-key.js'; + +describe('llmRoutingKey', () => { + it('gateway mode sends the role as the key', () => { + const key = llmRoutingKey({ + isGateway: true, + role: 'quality', + resolveDirectModel: () => 'should-not-be-used.gguf', + }); + expect(key).toBe('quality'); + }); + + it('gateway mode never evaluates the direct-model resolver (no /v1/models, no auto-select)', () => { + const resolveDirectModel = vi.fn(() => 'm'); + llmRoutingKey({ isGateway: true, role: 'fast', resolveDirectModel }); + expect(resolveDirectModel).not.toHaveBeenCalled(); + }); + + it('gateway mode falls back to "auto" when role is undefined', () => { + expect( + llmRoutingKey({ isGateway: true, role: undefined, resolveDirectModel: () => 'm' }), + ).toBe('auto'); + }); + + it('gateway mode honours a custom role fallback (e.g. reflection)', () => { + expect( + llmRoutingKey({ isGateway: true, role: undefined, resolveDirectModel: () => 'm', roleFallback: 'reflection' }), + ).toBe('reflection'); + }); + + it('direct mode returns the resolved model and ignores the role', () => { + const key = llmRoutingKey({ + isGateway: false, + role: 'quality', + resolveDirectModel: () => 'qwen3:8b', + }); + expect(key).toBe('qwen3:8b'); + }); + + it('direct mode preserves an undefined model (llama-server model omission)', () => { + expect( + llmRoutingKey({ isGateway: false, role: 'auto', resolveDirectModel: () => undefined }), + ).toBeUndefined(); + }); +}); + +describe('shouldRequeueForModelMismatch', () => { + it('NEVER requeues in gateway mode — availableModels holds roles, gateway owns model selection', () => { + // The regression: in gateway mode availableModels = advertised roles + // (auto/fast/quality), so a piece pinning a concrete model would never + // match and the job would requeue forever. + expect( + shouldRequeueForModelMismatch({ + isGateway: true, + pieceModel: 'qwen3:32b', + availableModels: new Set(['auto', 'fast', 'quality']), + workerModel: '', + }), + ).toBe(false); + }); + + it('direct mode requeues when the piece pins an unavailable model', () => { + expect( + shouldRequeueForModelMismatch({ + isGateway: false, + pieceModel: 'qwen3:32b', + availableModels: new Set(['qwen3:8b']), + workerModel: 'qwen3:8b', + }), + ).toBe(true); + }); + + it('direct mode does NOT requeue when the piece model is available', () => { + expect( + shouldRequeueForModelMismatch({ + isGateway: false, + pieceModel: 'qwen3:8b', + availableModels: new Set(['qwen3:8b', 'qwen3:32b']), + workerModel: 'qwen3:8b', + }), + ).toBe(false); + }); + + it('direct mode does NOT requeue when the piece model equals the worker model (probe empty)', () => { + expect( + shouldRequeueForModelMismatch({ + isGateway: false, + pieceModel: 'qwen3:8b', + availableModels: new Set(['other']), + workerModel: 'qwen3:8b', + }), + ).toBe(false); + }); + + it('direct mode does NOT requeue when the piece pins no model', () => { + expect( + shouldRequeueForModelMismatch({ + isGateway: false, + pieceModel: undefined, + availableModels: new Set(['qwen3:8b']), + workerModel: 'qwen3:8b', + }), + ).toBe(false); + }); + + it('direct mode does NOT requeue before the model list is probed (size 0)', () => { + expect( + shouldRequeueForModelMismatch({ + isGateway: false, + pieceModel: 'qwen3:32b', + availableModels: new Set(), + workerModel: '', + }), + ).toBe(false); + }); +}); diff --git a/src/llm/routing-key.ts b/src/llm/routing-key.ts new file mode 100644 index 0000000..51b0700 --- /dev/null +++ b/src/llm/routing-key.ts @@ -0,0 +1,61 @@ +/** + * The single decision shared by every LLM call site (job execution, ASK + * answer, title generation, reflection): what to put in `request.model`. + * + * - Gateway (proxy) mode routes by ROLE. The AAO Gateway resolves the role + * (auto/fast/quality/reflection) to a least-busy backend declaring it, so + * a heterogeneous GPU pool stays grouped by performance tier instead of by + * exact model name. No /v1/models discovery, no model auto-select — those + * collapsed every tier onto backends[0] (the 2026-06-09 over-concentration + * bug). + * - Direct mode sends the resolved model name (piece override, configured + * model, or omission for llama-server) exactly as before. + * + * The direct-model side is a thunk so gateway mode never runs the (logging, + * auto-select) resolution path. See + * docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md. + */ +export function llmRoutingKey(params: { + /** True when the worker connects through the AAO Gateway (connection_type: aao_gateway → proxy). */ + isGateway: boolean; + /** The job's required tier (or the worker's serving tier for title generation). */ + role: string | undefined; + /** Lazily resolves the direct-mode model. NOT called in gateway mode. */ + resolveDirectModel: () => string | undefined; + /** Role key when `role` is undefined. Defaults to 'auto'. */ + roleFallback?: string; +}): string | undefined { + if (params.isGateway) { + return params.role ?? params.roleFallback ?? 'auto'; + } + return params.resolveDirectModel(); +} + +/** + * Should the worker requeue a job because the piece pins a model this worker + * cannot serve? + * + * Direct mode: requeue when the piece names a concrete model that is neither + * in the probed `availableModels` set nor the worker's own configured model + * — another worker may serve it. + * + * Gateway mode: ALWAYS false. The worker routes by role and `availableModels` + * holds the gateway's advertised role set (not model names), so a piece that + * pins a real model would never match and the job would requeue forever. The + * gateway owns model selection there. See + * docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md. + */ +export function shouldRequeueForModelMismatch(params: { + isGateway: boolean; + pieceModel: string | undefined; + availableModels: Set; + workerModel: string | undefined; +}): boolean { + if (params.isGateway) return false; + return ( + !!params.pieceModel && + params.availableModels.size > 0 && + !params.availableModels.has(params.pieceModel) && + params.workerModel !== params.pieceModel + ); +} diff --git a/src/worker-bootstrap.ts b/src/worker-bootstrap.ts index 6d0477e..526e322 100644 --- a/src/worker-bootstrap.ts +++ b/src/worker-bootstrap.ts @@ -19,6 +19,7 @@ import { logger } from './logger.js'; import { accessSync, existsSync, mkdirSync, constants } from 'fs'; import { dirname, resolve, join } from 'path'; import { OpenAICompatClient } from './llm/openai-compat.js'; +import { llmRoutingKey } from './llm/routing-key.js'; import { ConfigManager } from './config-manager.js'; import { WorkerManager } from './worker-manager.js'; import { classifyPiece } from './engine/piece-classifier.js'; @@ -127,10 +128,23 @@ export async function start(opts: StartWorkerOptions = {}): Promise { if (titleWorker) { const titleModel = titleWorker.model ?? config.provider.model; - logger.info(`Config: title generation worker=${titleWorker.id} model=${titleModel ?? ''}`); + // Gateway (proxy) mode routes by role, not model name. Title generation + // is cheap, so send a tier the title worker serves (first of auto/fast/ + // quality), defaulting to 'auto'. Direct mode sends the configured model. + const titleTier = + titleWorker.roles?.find(r => r === 'auto' || r === 'fast' || r === 'quality') ?? 'auto'; + const titleRoutingKey = llmRoutingKey({ + isGateway: titleWorker.proxy === true, + role: titleTier, + resolveDirectModel: () => titleModel, + }); + logger.info( + `Config: title generation worker=${titleWorker.id} key=${titleRoutingKey ?? ''}` + + (titleWorker.proxy === true ? ' (gateway:role)' : ''), + ); titleClient = new OpenAICompatClient( titleWorker.endpoint, - titleModel, + titleRoutingKey, undefined, config.provider.retry, (config.provider.timeoutMinutes ?? 10) * 60 * 1000, diff --git a/src/worker.ts b/src/worker.ts index 0b63d60..91496c5 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -4,6 +4,7 @@ import { BrowserSessionRepo } from './db/browser-session-repo.js'; import { assertProfileOwner } from './engine/browser-session-auth.js'; import { initMasterKey, decryptUserDek, decryptStateBlob } from './crypto/sessions.js'; import { OpenAICompatClient } from './llm/openai-compat.js'; +import { llmRoutingKey, shouldRequeueForModelMismatch } from './llm/routing-key.js'; import { loadPiece, runPiece, PieceRunCallbacks, PieceDef, type PieceRunResult } from './engine/piece-runner.js'; import { LocalProgressReporter } from './progress/local-reporter.js'; import { buildLocalConversationContext } from './engine/local-context.js'; @@ -697,9 +698,14 @@ export class Worker { const parentJob = await this.repo.getJob(parentJobId); const parentInstruction = parentJob?.instruction ?? '(不明)'; - const resolvedModel = this.model; - const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000; const workerDefForAnswer = this.getWorkerDef(); + // Gateway routes by the subtask's tier; direct keeps the worker's model. + const resolvedModel = llmRoutingKey({ + isGateway: workerDefForAnswer.proxy === true, + role: subtaskJob.requiredRole, + resolveDirectModel: () => this.model, + }); + const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000; const llmClient = new OpenAICompatClient( this.endpoint, resolvedModel, @@ -954,11 +960,15 @@ export class Worker { ].filter((d): d is string => !!d); logger.info(`[worker:${this.workerId}] job ${jobId} loadPiece piece=${job.pieceName} customDirs=[${customPieceDirs.join(', ') || 'none'}] piecesDir=pieces`); const piece = loadPiece(job.pieceName, 'pieces', customPieceDirs); + // Model-mismatch requeue gate (direct mode only — gateway routes by + // role, see shouldRequeueForModelMismatch). if ( - piece.model && - this.availableModels.size > 0 && - !this.availableModels.has(piece.model) && - this.model !== piece.model + shouldRequeueForModelMismatch({ + isGateway: this.getWorkerDef().proxy === true, + pieceModel: piece.model, + availableModels: this.availableModels, + workerModel: this.model, + }) ) { await this.repo.updateJob(jobId, { status: 'queued', @@ -996,11 +1006,16 @@ export class Worker { return; } - // Piece のモデル指定を解決 - const resolvedModel = this.resolveModel(piece); - const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000; const workerDefForLlm = this.getWorkerDef(); const isProxyWorker = workerDefForLlm.proxy === true; + // Gateway routes by role; direct resolves the worker's model. The + // resolver thunk runs only in direct mode (no auto-select via gateway). + const resolvedModel = llmRoutingKey({ + isGateway: isProxyWorker, + role: job.requiredRole, + resolveDirectModel: () => this.resolveModel(piece), + }); + const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000; const llmClient = new OpenAICompatClient( this.endpoint, resolvedModel, @@ -1652,12 +1667,20 @@ export class Worker { private async handleReflectionJob(job: Job): Promise { const { runReflectionJob } = await import('./engine/reflection/reflection-runner.js'); try { + // Gateway mode routes by role: send the reflection tier as the key + // (job.requiredRole is 'reflection'), not the worker's model name. + const reflectionRoutingKey = llmRoutingKey({ + isGateway: this.getWorkerDef().proxy === true, + role: job.requiredRole, + resolveDirectModel: () => this.model, + roleFallback: 'reflection', + }); const outcome = await runReflectionJob( { repo: this.repo, config: this.config, llmEndpoint: this.endpoint, - llmModel: this.model, + llmModel: reflectionRoutingKey, }, job ); diff --git a/ui/src/components/settings/GatewayServerForm.tsx b/ui/src/components/settings/GatewayServerForm.tsx index 2848752..4bcd954 100644 --- a/ui/src/components/settings/GatewayServerForm.tsx +++ b/ui/src/components/settings/GatewayServerForm.tsx @@ -38,9 +38,28 @@ interface GatewayBackend { endpoint?: string; model?: string; maxSlots?: number; + /** Performance tiers (Roles) this backend serves. Empty = serves all. */ + roles?: string[]; apiKey?: string; } +/** Format a roles array for the comma-separated text input. */ +function rolesToInput(roles: string[] | undefined): string { + return Array.isArray(roles) ? roles.join(', ') : ''; +} + +/** + * Parse the comma-separated roles input into a deduped string array, or + * `undefined` when blank (= backend serves every role, migration-safe). + */ +function parseRolesInput(v: string): string[] | undefined { + const parts = v + .split(',') + .map(s => s.trim()) + .filter(s => s.length > 0); + return parts.length > 0 ? [...new Set(parts)] : undefined; +} + interface GatewayConfigShape { enabled?: boolean; listenPort?: number; @@ -246,7 +265,7 @@ export function GatewayServerForm({ config, onChange }: SectionFormProps) { - ルーティング先の llama-server / Ollama / vLLM など。Gateway は request.model に一致する model を持つ最も busy ではない backend に割り振ります。
+ ルーティング先の llama-server / Ollama / vLLM など。Gateway は worker が送る role を担う backend のうち最も空いているものに割り振ります (roles 未設定の backend は全 role 対応)。role を担う backend が無い場合のみ request.model = id/model の厳密一致にフォールバックします。
api_key の保存形式: フォームで入力した値は config.yaml に平文で保存されます。${'${VAR}'} 形式の env var 参照はフォーム保存時に literal 文字列として保存されるため、env 経由で渡したい場合は config.yaml を直接編集してください。
{backends.length === 0 ? ( @@ -291,6 +310,17 @@ export function GatewayServerForm({ config, onChange }: SectionFormProps) { placeholder="1" /> +
+ roles (任意) + updateBackend(i, 'roles', parseRolesInput(v))} + placeholder="quality, auto (空欄=全ロール)" + /> + + この backend が担う性能ティア (auto / fast / quality / reflection) をカンマ区切りで。worker はジョブの role を routing key として送り、Gateway がその role を担う最も空いている backend に振ります。空欄なら全ロールを担当 (従来どおり)。異なる model 名の GPU でも同じ role でまとめられます。 + +
api_key (任意)