sync: update from private repo (22cd034)
Some checks failed
CI / build-and-test (push) Has been cancelled

This commit is contained in:
oss-sync 2026-06-09 06:02:32 +00:00
parent 3848b5efd7
commit 8ac98d2630
13 changed files with 589 additions and 37 deletions

View File

@ -117,14 +117,20 @@ llm:
# shutdown_graceful_sec: 30 # SIGTERM 後、in-flight SSE の drain 上限秒
#
# backends:
# - id: gpu-a # `x-aao-backend-id` / `/v1/models` に出る ID
# # role ベース routing: worker はジョブの role (auto/fast/quality/reflection)
# # を routing key として送り、Gateway はその role を担う最も空いている
# # backend に振る。異なる model 名の GPU でも同じ role でまとめられる。
# # roles 未指定の backend は全 role を担当 (アップグレード後も従来どおり動く)。
# - id: gpu-a # `x-aao-backend-id` に出る ID
# endpoint: http://gpu-a:11434/v1
# model: qwen3:32b # 厳密一致 routing
# model: qwen3:32b # backend が積むモデル (role が無い時の fallback 一致キー)
# roles: [quality] # この backend は quality ティア専用
# max_slots: 2 # llama-server -np と合わせる
# api_key: ${GPU_A_API_KEY} # backend が bearer 必須な場合のみ
# - id: gpu-b
# endpoint: http://gpu-b:11434/v1
# model: qwen3:32b
# model: qwen3:8b # 別 GPU は別モデルでも OK
# roles: [fast, auto] # fast/auto ティアを担当
# max_slots: 2
#
# # Bootstrap / Backup 専用 virtual_keys (新規発行は admin API 経由を推奨)。

View File

@ -30,6 +30,7 @@ import { runMigrations } from '../db/migrate.js';
import {
readGatewayConfig,
validateGatewayConfig,
lintGatewayConfig,
type GatewayBackendConfig,
type GatewayConfig,
} from './config.js';
@ -206,6 +207,11 @@ export async function start(opts: StartGatewayOptions = {}): Promise<StartedGate
throw new Error(`gateway config has ${validationErrors.length} error(s); refusing to start`);
}
// Non-fatal lint (role↔id/model collisions). Routes fine, just ambiguously.
for (const w of lintGatewayConfig(gatewayConfig)) {
logger.warn(`[gateway-bootstrap] config warning: ${w}`);
}
logger.info(
`[gateway-bootstrap] enabled listen_port=${gatewayConfig.listenPort} backends=${gatewayConfig.backends.length} virtual_keys=${gatewayConfig.virtualKeys.length}`,
);

View File

@ -11,8 +11,11 @@ import { describe, it, expect } from 'vitest';
import {
normalizeGatewayConfig,
validateGatewayConfig,
lintGatewayConfig,
backendServesRole,
readGatewayConfig,
DEFAULT_GATEWAY_LISTEN_PORT,
DEFAULT_GATEWAY_BACKEND_ROLES,
type GatewayConfig,
} from './config.js';
import type { AppConfig } from '../config.js';
@ -72,6 +75,79 @@ describe('normalizeGatewayConfig', () => {
const norm = normalizeGatewayConfig({ enabled: 1 as unknown as boolean });
expect(norm.enabled).toBe(false);
});
// --- Role-based routing (2026-06-09) ---
it('parses backends[].roles into a deduped string array', () => {
  // Repeated entries collapse to one; the expected output keeps first-seen order.
  const backend = {
    id: 'x',
    endpoint: 'http://x/v1',
    model: 'm',
    maxSlots: 4,
    roles: ['quality', 'quality', 'auto'],
  };
  const { backends } = normalizeGatewayConfig({ backends: [backend] });
  expect(backends[0]!.roles).toEqual(['quality', 'auto']);
});
it('drops empty / non-string role entries', () => {
  // '' and ' ' are blank after trimming; 7 is not a string at all.
  const noisyRoles = ['fast', '', 7 as unknown as string, ' '];
  const backend = { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4, roles: noisyRoles };
  const { backends } = normalizeGatewayConfig({ backends: [backend] });
  expect(backends[0]!.roles).toEqual(['fast']);
});
it('leaves roles undefined when not provided (migration safety = serves all)', () => {
  const backend = { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4 };
  const { backends } = normalizeGatewayConfig({ backends: [backend] });
  expect(backends[0]!.roles).toBeUndefined();
});
});
describe('backendServesRole', () => {
  // Minimal backend fixture; `roles` is the only field these cases vary.
  const base = { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 1 };

  it('matches a declared role', () => {
    const qualityOnly = { ...base, roles: ['quality'] };
    expect(backendServesRole(qualityOnly, 'quality')).toBe(true);
    expect(backendServesRole(qualityOnly, 'fast')).toBe(false);
  });

  it('serves every default execution role when roles is unspecified', () => {
    DEFAULT_GATEWAY_BACKEND_ROLES.forEach(role => {
      expect(backendServesRole(base, role)).toBe(true);
    });
    // Keys outside the default set are still rejected.
    expect(backendServesRole(base, 'not-a-role')).toBe(false);
  });

  it('treats an empty roles array as unspecified (serves all)', () => {
    const emptyRoles = { ...base, roles: [] };
    expect(backendServesRole(emptyRoles, 'auto')).toBe(true);
  });
});
describe('lintGatewayConfig', () => {
  /** True when at least one warning both names `name` and describes a collision. */
  const mentionsCollision = (warnings: string[], name: string): boolean =>
    warnings.some(w => w.includes(name) && /collid/i.test(w));

  it('returns no warnings for a clean config', () => {
    const warnings = lintGatewayConfig(makeValidConfig());
    expect(warnings).toEqual([]);
  });

  it('warns when a role name collides with another backend id', () => {
    const cfg = makeValidConfig();
    cfg.backends = [
      { id: 'auto', endpoint: 'http://x/v1', model: 'm', maxSlots: 1 },
      { id: 'gpu-b', endpoint: 'http://y/v1', model: 'm', maxSlots: 1, roles: ['auto'] },
    ];
    expect(mentionsCollision(lintGatewayConfig(cfg), 'auto')).toBe(true);
  });

  it('warns when a role name collides with a backend model', () => {
    const cfg = makeValidConfig();
    cfg.backends = [
      { id: 'gpu-a', endpoint: 'http://x/v1', model: 'fast', maxSlots: 1 },
      { id: 'gpu-b', endpoint: 'http://y/v1', model: 'm', maxSlots: 1, roles: ['fast'] },
    ];
    expect(mentionsCollision(lintGatewayConfig(cfg), 'fast')).toBe(true);
  });
});
describe('validateGatewayConfig', () => {

View File

@ -18,14 +18,45 @@ export interface GatewayBackendConfig {
id: string;
/** Upstream base URL — `/v1` suffix required so /chat/completions resolves cleanly. */
endpoint: string;
/** Model name this backend serves. The gateway routes by strict equality on `request.model`. */
/** Model name this backend serves. Used as the backward-compat routing key (strict `request.model` equality). */
model: string;
/** Concurrent slots the upstream can serve. Mirrors llama-server's `-np`. */
maxSlots: number;
/**
* Performance tiers (Roles) this backend serves: any of `auto`/`fast`/
* `quality`/`reflection`. A worker sends the job's required role as the
* routing key; the gateway routes each role to the least-busy backend declaring it.
* This lets a heterogeneous GPU pool (different model names per box) be
* grouped by tier rather than by exact model name.
*
* Undefined / empty = serves every default execution role
* (DEFAULT_GATEWAY_BACKEND_ROLES), which keeps a pre-roles single-tier config
* working unchanged after upgrade.
*/
roles?: string[];
/** Optional bearer to forward to the backend (Ollama is usually unauthed). */
apiKey?: string;
}
/**
 * Roles a backend is assumed to serve when it declares no `roles` of its
 * own: the job tiers auto / fast / quality / reflection. Tier separation is
 * opt-in and only takes effect once a backend lists an explicit `roles`
 * subset.
 */
export const DEFAULT_GATEWAY_BACKEND_ROLES: readonly string[] = ['auto', 'fast', 'quality', 'reflection'];

/**
 * Reports whether `backend` serves `role`.
 *
 * A backend with a missing or empty `roles` list is treated as serving the
 * whole DEFAULT_GATEWAY_BACKEND_ROLES set, so a config written before roles
 * existed keeps working. The router uses this to pick role-match candidates
 * and only falls back to strict id/model matching when no backend serves
 * the key.
 *
 * @param backend Backend config entry to test.
 * @param role Routing key to look up.
 * @returns `true` when `role` is in the backend's explicit or default role set.
 */
export function backendServesRole(backend: GatewayBackendConfig, role: string): boolean {
  const explicit = backend.roles;
  if (!explicit?.length) {
    // Nothing declared: fall back to the default tier set.
    return DEFAULT_GATEWAY_BACKEND_ROLES.includes(role);
  }
  return explicit.includes(role);
}
export interface GatewayVirtualKey {
/** Bearer string a client sends in `Authorization: Bearer <key>`. */
key: string;
@ -190,11 +221,22 @@ function normalizeBackend(b: GatewayBackendConfig): GatewayBackendConfig {
typeof ms === 'number' && Number.isFinite(ms) && ms > 0
? Math.floor(ms)
: NaN;
// Roles: keep undefined (= serves all) when absent, else dedupe and drop
// empty / non-string entries so a typo'd YAML list can't smuggle a blank
// role that matches nothing.
let roles: string[] | undefined;
if (Array.isArray(b.roles)) {
const cleaned = b.roles
.filter((r): r is string => typeof r === 'string' && r.trim().length > 0)
.map(r => r.trim());
roles = [...new Set(cleaned)];
}
return {
id: b.id,
endpoint: b.endpoint,
model: b.model,
maxSlots,
...(roles !== undefined ? { roles } : {}),
apiKey: b.apiKey,
};
}
@ -281,6 +323,11 @@ export function validateGatewayConfig(cfg: GatewayConfig): string[] {
`gateway.backends[${b.id}].max_slots must be a positive integer (got ${b.maxSlots})`,
);
}
if (b.roles !== undefined) {
if (!Array.isArray(b.roles) || b.roles.some(r => typeof r !== 'string' || r.trim() === '')) {
errors.push(`gateway.backends[${b.id}].roles must be an array of non-empty strings if defined`);
}
}
}
}
@ -351,6 +398,48 @@ export function validateGatewayConfig(cfg: GatewayConfig): string[] {
return errors;
}
/**
 * Non-fatal lint pass: returns human-readable WARNINGS (empty = clean).
 * Kept separate from validateGatewayConfig (which returns boot-fatal errors)
 * because these conditions still route, just ambiguously.
 *
 * Currently flags role-name vs. backend-id/model collisions. The router
 * resolves a routing key by role membership FIRST and only falls back to an
 * id/model exact match when no backend serves the key, so a backend whose id
 * or model equals a role that ANOTHER backend serves cannot be targeted by
 * that literal name. The operator probably did not intend the shadowing, so
 * this warns instead of blocking.
 *
 * A name that matches only a role the backend itself serves is not flagged:
 * role routing for that key can only land on that same backend, so nothing
 * is shadowed. This matters because a backend without explicit `roles`
 * implicitly serves every default role.
 *
 * @param cfg Gateway config to inspect.
 * @returns One warning string per shadowed id/model; empty when clean or
 *          when `cfg.backends` is not an array.
 */
export function lintGatewayConfig(cfg: GatewayConfig): string[] {
  const warnings: string[] = [];
  if (!Array.isArray(cfg.backends)) return warnings;

  // Effective role set of one backend (explicit list, else the defaults),
  // deduped so a repeated entry cannot inflate the per-role counts below.
  const effectiveRoles = (b: GatewayBackendConfig): Set<string> =>
    new Set(b.roles && b.roles.length > 0 ? b.roles : DEFAULT_GATEWAY_BACKEND_ROLES);

  // role -> number of backends serving it (explicit or default-implied).
  const servers = new Map<string, number>();
  for (const b of cfg.backends) {
    for (const r of effectiveRoles(b)) {
      servers.set(r, (servers.get(r) ?? 0) + 1);
    }
  }

  for (const b of cfg.backends) {
    const own = effectiveRoles(b);
    // A name is shadowed only when some OTHER backend serves it as a role,
    // so subtract this backend's own contribution before comparing.
    const shadowedByAnother = (name: string): boolean =>
      (servers.get(name) ?? 0) - (own.has(name) ? 1 : 0) > 0;

    if (shadowedByAnother(b.id)) {
      warnings.push(
        `gateway.backends[${b.id}]: backend id '${b.id}' collides with a role name served by the pool — ` +
          `role routing shadows it, so '${b.id}' is unreachable as a literal model name`,
      );
    }
    // Skip the model check when it equals the id: that name was handled above.
    if (b.model && b.model !== b.id && shadowedByAnother(b.model)) {
      warnings.push(
        `gateway.backends[${b.id}]: model '${b.model}' collides with a role name served by the pool — ` +
          `role routing shadows it, so '${b.model}' is unreachable as a literal model name`,
      );
    }
  }
  return warnings;
}
/**
* Pull the (possibly missing) gateway block off AppConfig. Used by
* gateway/bootstrap.ts so both worker and gateway modes can call

View File

@ -11,9 +11,13 @@ describe('buildModelsResponse', () => {
expect(resp.data).toEqual([]);
});
it('maps each backend id to a model entry', () => {
it('maps each backend id to a model entry (id-shaped, not role-shaped)', () => {
// /v1/models stays backend-id-shaped: the Settings "physical backends"
// panel (config-api /api/workers/:id/backends) reads it to list the GPUs
// behind a proxy worker. Role-based routing lives in the router + worker,
// not in this discovery endpoint (2026-06-09 decision).
const resp = buildModelsResponse([
{ id: 'gpu-a', endpoint: 'http://a/v1', model: 'qwen3:8b', maxSlots: 4 },
{ id: 'gpu-a', endpoint: 'http://a/v1', model: 'qwen3:8b', maxSlots: 4, roles: ['quality'] },
{ id: 'gpu-b', endpoint: 'http://b/v1', model: 'qwen3:14b', maxSlots: 4 },
]);
expect(resp.data).toEqual([

View File

@ -4,7 +4,15 @@
* Returns one entry per `gateway.backends[]` with `id` set to the backend
* id (which is also what `x-aao-backend-id` and `x-litellm-model-id` will
* carry in responses). Clients hit this endpoint to discover which model
* names they may request.
* names they may request, and `/api/workers/:id/backends` (the Settings
* "physical backends" panel) reads it to list the GPUs behind a proxy
* worker, so it stays backend-id-shaped, NOT role-shaped.
*
* Role-based routing (2026-06-09) lives entirely in the router + the worker
* (which sends a role as the routing key). The over-concentration bug was
* fixed worker-side by dropping model auto-select in gateway mode, so this
* endpoint did not need to change. A client may still request a role: the
* router's role-match arm handles it even though roles aren't advertised here.
*
* Auth required: the response surfaces backend ids which could otherwise
* be enumerated by an unauthenticated probe.

View File

@ -41,6 +41,10 @@ function bk(id: string, model: string, maxSlots = 4): GatewayBackendConfig {
return { id, endpoint: `http://${id}/v1`, model, maxSlots };
}
/** Test fixture: a backend config with an explicit `roles` list; the endpoint is derived from the id. */
function bkRoles(id: string, model: string, roles: string[], maxSlots = 4): GatewayBackendConfig {
  const endpoint = `http://${id}/v1`;
  return { id, endpoint, model, maxSlots, roles };
}
describe('createRouter.pick', () => {
it('returns null when model has no configured backend', () => {
const r = createRouter({
@ -195,6 +199,95 @@ describe('createRouter.pick', () => {
});
});
describe('createRouter.pick — role-based routing (2026-06-09)', () => {
  it('routes a role key only to backends that declare the role', () => {
    // Heterogeneous pool: two GPUs serve the quality tier under DIFFERENT
    // model names and one serves fast. A "quality" request must never land
    // on the fast-only backend, even though that one is the most idle.
    const pool = (): GatewayBackendConfig[] => [
      bkRoles('q1', 'big-model-a.gguf', ['quality']),
      bkRoles('q2', 'big-model-b.gguf', ['quality']),
      bkRoles('f1', 'small-model.gguf', ['fast']),
    ];
    const registry = makeRegistry([
      status('q1', true, 3, 4), // ratio 0.75
      status('q2', true, 1, 4), // ratio 0.25 (winner within the role)
      status('f1', true, 0, 4), // idle but WRONG role
    ]);
    const router = createRouter({ getBackends: pool, registry });

    expect(router.pick('quality')?.id).toBe('q2');
  });

  it('returns null (503) when every backend serving the role is saturated — no spill to other tiers', () => {
    const pool = (): GatewayBackendConfig[] => [
      bkRoles('q1', 'a.gguf', ['quality'], 2),
      bkRoles('f1', 'b.gguf', ['fast'], 2),
    ];
    const registry = makeRegistry([
      status('q1', true, 2, 2), // quality full
      status('f1', true, 0, 2), // fast idle — must NOT be picked for quality
    ]);
    const router = createRouter({ getBackends: pool, registry });

    expect(router.pick('quality')).toBeNull();
  });

  it('enforces per-id max_slots independently within a role', () => {
    const inflight = createBackendInflightCounter();
    const pool = (): GatewayBackendConfig[] => [
      bkRoles('q1', 'a.gguf', ['quality'], 1),
      bkRoles('q2', 'b.gguf', ['quality'], 1),
    ];
    const registry = makeRegistry([status('q1', true, 0, 1), status('q2', true, 0, 1)]);
    const router = createRouter({ getBackends: pool, registry, inflight });

    // Two picks, evaluated in order, must land on distinct backends.
    const firstTwo = [router.pick('quality')!.id, router.pick('quality')!.id];
    expect(new Set(firstTwo)).toEqual(new Set(['q1', 'q2'])); // one each
    expect(router.pick('quality')).toBeNull(); // both reserved
  });

  it('treats a backend with unspecified roles as serving every role', () => {
    const router = createRouter({
      getBackends: () => [bk('generalist', 'm.gguf')], // no roles => all
      registry: makeRegistry([status('generalist', true, 0, 4)]),
    });

    expect(router.pick('auto')?.id).toBe('generalist');
    expect(router.pick('fast')?.id).toBe('generalist');
    expect(router.pick('quality')?.id).toBe('generalist');
  });

  it('falls back to id/model exact match when NO backend declares the role (backward compat)', () => {
    // A legacy caller sends a literal model name. No backend declares it as
    // a role, so the router falls back to strict id/model matching.
    const router = createRouter({
      getBackends: () => [bkRoles('q1', 'legacy-model.gguf', ['quality'])],
      registry: makeRegistry([status('q1', true, 0, 4)]),
    });

    expect(router.pick('legacy-model.gguf')?.id).toBe('q1'); // model-name fallback
    expect(router.pick('q1')?.id).toBe('q1'); // id fallback
  });

  it('prefers role match over an id/model collision (role takes precedence)', () => {
    // 'quality' is BOTH a declared role on roleBackend AND the id of a
    // different backend. Role membership wins per decision 3.
    const pool = (): GatewayBackendConfig[] => [
      bkRoles('roleBackend', 'm-a.gguf', ['quality']),
      bkRoles('quality', 'm-b.gguf', ['fast']), // id literally "quality" but serves fast — the trap
    ];
    const registry = makeRegistry([
      status('roleBackend', true, 0, 4),
      status('quality', true, 0, 4),
    ]);
    const router = createRouter({ getBackends: pool, registry });

    // Only the role-declaring backend is eligible; the id-collision one is ignored.
    const picks = Array.from({ length: 5 }, () => router.pick('quality')?.id);
    expect(picks).toEqual(['roleBackend', 'roleBackend', 'roleBackend', 'roleBackend', 'roleBackend']);
  });
});
describe('createRouter.pick — live reservation (per-host max_slots enforcement)', () => {
it('reserves a slot on pick so a maxSlots=1 backend saturates after one pick', () => {
// Root-cause regression: before the fix, pick() judged saturation

View File

@ -2,16 +2,19 @@
* Least-busy backend router.
*
* Selection algorithm:
* 1. Filter to backends whose configured `id` OR `model` strictly
* matches `request.model`. Accepting both means a client can
* either name a specific deployment (`b.id`, mirrors LiteLLM's
* deployment-name convention used in our /v1/models + /health
* responses) or fall back to the generic model alias (`b.model`,
* for tools that pass through the raw model name configured on
* the worker side). Without the `b.id` arm, the gateway exposed
* ids via /v1/models that the worker would faithfully echo back
* in the chat request, and routing would 503 because nothing
* matched on b.model. Discovered during 2026-05-20 dogfooding.
* 1. Pick the candidate pool by ROLE first (2026-06-09). The routing
* key (`request.model`) is normally a performance tier a worker
* sends: auto/fast/quality/reflection. Backends declare which
* tiers they serve via `backends[].roles`; a backend with no
* `roles` serves them all. This groups a heterogeneous GPU pool
* (different model names per box) by tier rather than by exact
* model name. Membership is by config, not live load, so a fully
* saturated tier returns 503 instead of spilling to another tier.
* Only when NO backend serves the key do we fall back to the
* legacy strict `id` OR `model` match, keeping pre-roles configs
* and raw OpenAI clients (which name a deployment id/model alias,
* mirrored in /v1/models + /health) working. The id arm fixed a
* 503 found during 2026-05-20 dogfooding.
* 2. Drop backends the registry says are offline or saturated
* (`busy_slots >= total_slots`).
* 3. Sort the survivors by `busy_slots / total_slots` ascending
@ -27,7 +30,7 @@
* predictable in logs. The bias is at most O(slots) over the lifetime
* of one process, which is fine for a single binary.
*/
import type { GatewayBackendConfig } from './config.js';
import { backendServesRole, type GatewayBackendConfig } from './config.js';
import type { BackendStatusRegistry, NodeStatus } from '../engine/backend-status-registry.js';
export interface RouterSnapshotBackend {
@ -122,7 +125,24 @@ export function createRouter(deps: CreateRouterDeps): Router {
deps.registry.noteSubscriberActivity?.();
if (typeof model !== 'string' || model.length === 0) return null;
const backends = deps.getBackends();
const allBackends = deps.getBackends();
// Candidate pool selection (role-based, dual-match — 2026-06-09).
//
// The routing key (`model`) is normally a performance tier/role the
// worker sends (auto/fast/quality/reflection). Route to backends that
// DECLARE that role first; this groups a heterogeneous GPU pool by
// tier instead of by exact model name. Role membership is by config
// (backendServesRole), NOT by live availability — so if any backend
// serves the role, we stay inside that tier even when all of them are
// saturated (→ 503), rather than leaking the request to another tier.
//
// Only when NO backend serves the key do we fall back to the legacy
// strict id/model match. That keeps pre-roles configs and non-maestro
// OpenAI clients (which send a literal model name) working unchanged.
const roleMatched = allBackends.filter(b => backendServesRole(b, model));
const backends = roleMatched.length > 0
? roleMatched
: allBackends.filter(b => b.id === model || b.model === model);
// One getAll() per pick, not one per backend — the previous
// `lookupStatus(b.id)` inside the loop did `registry.getAll()`
// (which slices its internal cache) once per candidate. With N
@ -137,11 +157,9 @@ export function createRouter(deps: CreateRouterDeps): Router {
const warm: RankedCandidate[] = [];
const cold: RankedCandidate[] = [];
for (const b of backends) {
// Match either the deployment id (LiteLLM-style routing key
// mirrored in /v1/models + /health) or the model alias the
// backend serves. See top-of-file docstring for why both arms
// are needed.
if (b.id !== model && b.model !== model) continue;
// `backends` is already filtered to the chosen candidate pool
// (role match, else id/model fallback) — no per-backend match
// check needed here.
// Live reservation count for this backend (0 when no inflight
// counter is wired). Used both to gate admission and to rank.
const reserved = deps.inflight?.get(b.id) ?? 0;

124
src/llm/routing-key.test.ts Normal file
View File

@ -0,0 +1,124 @@
/**
* Routing-key selection tests.
*
* The single decision shared by every LLM call site: gateway (proxy) mode
* routes by Role; direct mode sends the resolved model name. See
* docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md.
*/
import { describe, it, expect, vi } from 'vitest';
import { llmRoutingKey, shouldRequeueForModelMismatch } from './routing-key.js';
describe('llmRoutingKey', () => {
  /** Builds a direct-model resolver that always yields `model`. */
  const always = (model: string | undefined) => () => model;

  it('gateway mode sends the role as the key', () => {
    const routed = llmRoutingKey({
      isGateway: true,
      role: 'quality',
      resolveDirectModel: always('should-not-be-used.gguf'),
    });
    expect(routed).toBe('quality');
  });

  it('gateway mode never evaluates the direct-model resolver (no /v1/models, no auto-select)', () => {
    const directResolver = vi.fn(() => 'm');
    llmRoutingKey({ isGateway: true, role: 'fast', resolveDirectModel: directResolver });
    expect(directResolver).not.toHaveBeenCalled();
  });

  it('gateway mode falls back to "auto" when role is undefined', () => {
    const routed = llmRoutingKey({ isGateway: true, role: undefined, resolveDirectModel: always('m') });
    expect(routed).toBe('auto');
  });

  it('gateway mode honours a custom role fallback (e.g. reflection)', () => {
    const routed = llmRoutingKey({
      isGateway: true,
      role: undefined,
      resolveDirectModel: always('m'),
      roleFallback: 'reflection',
    });
    expect(routed).toBe('reflection');
  });

  it('direct mode returns the resolved model and ignores the role', () => {
    const routed = llmRoutingKey({
      isGateway: false,
      role: 'quality',
      resolveDirectModel: always('qwen3:8b'),
    });
    expect(routed).toBe('qwen3:8b');
  });

  it('direct mode preserves an undefined model (llama-server model omission)', () => {
    const routed = llmRoutingKey({ isGateway: false, role: 'auto', resolveDirectModel: always(undefined) });
    expect(routed).toBeUndefined();
  });
});
describe('shouldRequeueForModelMismatch', () => {
  /** Runs the gate in direct mode with the given piece model, probed models and worker model. */
  const directVerdict = (pieceModel: string | undefined, probed: string[], workerModel: string): boolean =>
    shouldRequeueForModelMismatch({
      isGateway: false,
      pieceModel,
      availableModels: new Set(probed),
      workerModel,
    });

  it('NEVER requeues in gateway mode — availableModels holds roles, gateway owns model selection', () => {
    // The regression: in gateway mode availableModels = advertised roles
    // (auto/fast/quality), so a piece pinning a concrete model would never
    // match and the job would requeue forever.
    const verdict = shouldRequeueForModelMismatch({
      isGateway: true,
      pieceModel: 'qwen3:32b',
      availableModels: new Set(['auto', 'fast', 'quality']),
      workerModel: '',
    });
    expect(verdict).toBe(false);
  });

  it('direct mode requeues when the piece pins an unavailable model', () => {
    expect(directVerdict('qwen3:32b', ['qwen3:8b'], 'qwen3:8b')).toBe(true);
  });

  it('direct mode does NOT requeue when the piece model is available', () => {
    expect(directVerdict('qwen3:8b', ['qwen3:8b', 'qwen3:32b'], 'qwen3:8b')).toBe(false);
  });

  it('direct mode does NOT requeue when the piece model equals the worker model (probe empty)', () => {
    expect(directVerdict('qwen3:8b', ['other'], 'qwen3:8b')).toBe(false);
  });

  it('direct mode does NOT requeue when the piece pins no model', () => {
    expect(directVerdict(undefined, ['qwen3:8b'], 'qwen3:8b')).toBe(false);
  });

  it('direct mode does NOT requeue before the model list is probed (size 0)', () => {
    expect(directVerdict('qwen3:32b', [], '')).toBe(false);
  });
});

61
src/llm/routing-key.ts Normal file
View File

@ -0,0 +1,61 @@
/**
 * The single decision shared by every LLM call site (job execution, ASK
 * answer, title generation, reflection): what to put in `request.model`.
 *
 * - Gateway (proxy) mode routes by ROLE. The AAO Gateway resolves the role
 *   (auto/fast/quality/reflection) to a least-busy backend declaring it, so
 *   a heterogeneous GPU pool stays grouped by performance tier instead of by
 *   exact model name. There is no /v1/models discovery and no model
 *   auto-select; those collapsed every tier onto backends[0] (the
 *   2026-06-09 over-concentration bug).
 * - Direct mode sends the resolved model name (piece override, configured
 *   model, or omission for llama-server) exactly as before.
 *
 * The direct-model side is a thunk so gateway mode never runs the (logging,
 * auto-select) resolution path. See
 * docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md.
 *
 * In gateway mode a missing OR blank role (undefined, empty, or
 * whitespace-only) falls through to `roleFallback`, and a missing or blank
 * fallback falls through to 'auto'. A blank key is never sent: the gateway
 * router rejects a zero-length routing key outright, and a whitespace-only
 * one matches no role.
 *
 * @returns The routing key for `request.model`; `undefined` only in direct
 *          mode when the resolver yields no model.
 */
export function llmRoutingKey(params: {
  /** True when the worker connects through the AAO Gateway (connection_type: aao_gateway → proxy). */
  isGateway: boolean;
  /** The job's required tier (or the worker's serving tier for title generation). */
  role: string | undefined;
  /** Lazily resolves the direct-mode model. NOT called in gateway mode. */
  resolveDirectModel: () => string | undefined;
  /** Role key when `role` is undefined or blank. Defaults to 'auto'. */
  roleFallback?: string;
}): string | undefined {
  if (!params.isGateway) {
    return params.resolveDirectModel();
  }
  // A usable key is a string with at least one non-whitespace character.
  const isUsable = (value: string | undefined): value is string =>
    typeof value === 'string' && value.trim().length > 0;

  if (isUsable(params.role)) return params.role;
  if (isUsable(params.roleFallback)) return params.roleFallback;
  return 'auto';
}
/**
 * Decides whether the worker should requeue a job because the piece pins a
 * model this worker cannot serve.
 *
 * Direct mode: requeue when the piece names a concrete model that is neither
 * in the probed `availableModels` set nor the worker's own configured model,
 * since another worker may serve it. Nothing is requeued while the probed
 * set is still empty or when the piece pins no model.
 *
 * Gateway mode: ALWAYS false. The worker routes by role and `availableModels`
 * holds the gateway's advertised role set (not model names), so a piece that
 * pins a real model would never match and the job would requeue forever. The
 * gateway owns model selection there. See
 * docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md.
 */
export function shouldRequeueForModelMismatch(params: {
  isGateway: boolean;
  pieceModel: string | undefined;
  availableModels: Set<string>;
  workerModel: string | undefined;
}): boolean {
  const { isGateway, pieceModel, availableModels, workerModel } = params;

  // Gateway owns model selection, so there is never a mismatch to act on.
  if (isGateway) return false;
  // No pinned model means nothing can mismatch.
  if (!pieceModel) return false;
  // Model list not probed yet: do not bounce the job on incomplete data.
  if (!(availableModels.size > 0)) return false;
  // The pinned model is one this worker can serve.
  if (availableModels.has(pieceModel)) return false;
  // Last chance: the worker's own configured model may equal the pinned one.
  return workerModel !== pieceModel;
}

View File

@ -19,6 +19,7 @@ import { logger } from './logger.js';
import { accessSync, existsSync, mkdirSync, constants } from 'fs';
import { dirname, resolve, join } from 'path';
import { OpenAICompatClient } from './llm/openai-compat.js';
import { llmRoutingKey } from './llm/routing-key.js';
import { ConfigManager } from './config-manager.js';
import { WorkerManager } from './worker-manager.js';
import { classifyPiece } from './engine/piece-classifier.js';
@ -127,10 +128,23 @@ export async function start(opts: StartWorkerOptions = {}): Promise<void> {
if (titleWorker) {
const titleModel = titleWorker.model ?? config.provider.model;
logger.info(`Config: title generation worker=${titleWorker.id} model=${titleModel ?? '<none>'}`);
// Gateway (proxy) mode routes by role, not model name. Title generation
// is cheap, so send a tier the title worker serves (first of auto/fast/
// quality), defaulting to 'auto'. Direct mode sends the configured model.
const titleTier =
titleWorker.roles?.find(r => r === 'auto' || r === 'fast' || r === 'quality') ?? 'auto';
const titleRoutingKey = llmRoutingKey({
isGateway: titleWorker.proxy === true,
role: titleTier,
resolveDirectModel: () => titleModel,
});
logger.info(
`Config: title generation worker=${titleWorker.id} key=${titleRoutingKey ?? '<none>'}` +
(titleWorker.proxy === true ? ' (gateway:role)' : ''),
);
titleClient = new OpenAICompatClient(
titleWorker.endpoint,
titleModel,
titleRoutingKey,
undefined,
config.provider.retry,
(config.provider.timeoutMinutes ?? 10) * 60 * 1000,

View File

@ -4,6 +4,7 @@ import { BrowserSessionRepo } from './db/browser-session-repo.js';
import { assertProfileOwner } from './engine/browser-session-auth.js';
import { initMasterKey, decryptUserDek, decryptStateBlob } from './crypto/sessions.js';
import { OpenAICompatClient } from './llm/openai-compat.js';
import { llmRoutingKey, shouldRequeueForModelMismatch } from './llm/routing-key.js';
import { loadPiece, runPiece, PieceRunCallbacks, PieceDef, type PieceRunResult } from './engine/piece-runner.js';
import { LocalProgressReporter } from './progress/local-reporter.js';
import { buildLocalConversationContext } from './engine/local-context.js';
@ -697,9 +698,14 @@ export class Worker {
const parentJob = await this.repo.getJob(parentJobId);
const parentInstruction = parentJob?.instruction ?? '(不明)';
const resolvedModel = this.model;
const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
const workerDefForAnswer = this.getWorkerDef();
// Gateway routes by the subtask's tier; direct keeps the worker's model.
const resolvedModel = llmRoutingKey({
isGateway: workerDefForAnswer.proxy === true,
role: subtaskJob.requiredRole,
resolveDirectModel: () => this.model,
});
const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
const llmClient = new OpenAICompatClient(
this.endpoint,
resolvedModel,
@ -954,11 +960,15 @@ export class Worker {
].filter((d): d is string => !!d);
logger.info(`[worker:${this.workerId}] job ${jobId} loadPiece piece=${job.pieceName} customDirs=[${customPieceDirs.join(', ') || 'none'}] piecesDir=pieces`);
const piece = loadPiece(job.pieceName, 'pieces', customPieceDirs);
// Model-mismatch requeue gate (direct mode only — gateway routes by
// role, see shouldRequeueForModelMismatch).
if (
piece.model &&
this.availableModels.size > 0 &&
!this.availableModels.has(piece.model) &&
this.model !== piece.model
shouldRequeueForModelMismatch({
isGateway: this.getWorkerDef().proxy === true,
pieceModel: piece.model,
availableModels: this.availableModels,
workerModel: this.model,
})
) {
await this.repo.updateJob(jobId, {
status: 'queued',
@ -996,11 +1006,16 @@ export class Worker {
return;
}
// Piece のモデル指定を解決
const resolvedModel = this.resolveModel(piece);
const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
const workerDefForLlm = this.getWorkerDef();
const isProxyWorker = workerDefForLlm.proxy === true;
// Gateway routes by role; direct resolves the worker's model. The
// resolver thunk runs only in direct mode (no auto-select via gateway).
const resolvedModel = llmRoutingKey({
isGateway: isProxyWorker,
role: job.requiredRole,
resolveDirectModel: () => this.resolveModel(piece),
});
const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
const llmClient = new OpenAICompatClient(
this.endpoint,
resolvedModel,
@ -1652,12 +1667,20 @@ export class Worker {
private async handleReflectionJob(job: Job): Promise<void> {
const { runReflectionJob } = await import('./engine/reflection/reflection-runner.js');
try {
// Gateway mode routes by role: send the reflection tier as the key
// (job.requiredRole is 'reflection'), not the worker's model name.
const reflectionRoutingKey = llmRoutingKey({
isGateway: this.getWorkerDef().proxy === true,
role: job.requiredRole,
resolveDirectModel: () => this.model,
roleFallback: 'reflection',
});
const outcome = await runReflectionJob(
{
repo: this.repo,
config: this.config,
llmEndpoint: this.endpoint,
llmModel: this.model,
llmModel: reflectionRoutingKey,
},
job
);

View File

@ -38,9 +38,28 @@ interface GatewayBackend {
endpoint?: string;
model?: string;
maxSlots?: number;
/** Performance tiers (Roles) this backend serves. Empty = serves all. */
roles?: string[];
apiKey?: string;
}
/**
 * Renders a backend's roles for the comma-separated text input.
 * Anything that is not an array (including `undefined`) renders as ''.
 */
function rolesToInput(roles: string[] | undefined): string {
  if (!Array.isArray(roles)) {
    return '';
  }
  return roles.join(', ');
}
/**
 * Turns the comma-separated roles text into a list of unique, trimmed role
 * names in first-seen order. Returns `undefined` when no non-blank entry
 * remains, which means the backend serves every role (migration-safe).
 */
function parseRolesInput(v: string): string[] | undefined {
  const unique = new Set<string>();
  for (const piece of v.split(',')) {
    const role = piece.trim();
    if (role.length > 0) {
      unique.add(role);
    }
  }
  return unique.size > 0 ? Array.from(unique) : undefined;
}
interface GatewayConfigShape {
enabled?: boolean;
listenPort?: number;
@ -246,7 +265,7 @@ export function GatewayServerForm({ config, onChange }: SectionFormProps) {
</button>
</div>
<HelpText>
llama-server / Ollama / vLLM Gateway <code>request.model</code> <code>model</code> busy backend <br/>
llama-server / Ollama / vLLM Gateway worker <strong>role</strong> backend (<code>roles</code> backend role )role backend <code>request.model</code> = <code>id</code>/<code>model</code> <br/>
<strong>api_key </strong>: <code>config.yaml</code> <code>${'${VAR}'}</code> env var literal env <code>config.yaml</code>
</HelpText>
{backends.length === 0 ? (
@ -291,6 +310,17 @@ export function GatewayServerForm({ config, onChange }: SectionFormProps) {
placeholder="1"
/>
</div>
<div className="col-span-2">
<FieldLabel>roles ()</FieldLabel>
<FieldInput
value={rolesToInput(b.roles)}
onChange={v => updateBackend(i, 'roles', parseRolesInput(v))}
placeholder="quality, auto (空欄=全ロール)"
/>
<HelpText>
backend (<code>auto</code> / <code>fast</code> / <code>quality</code> / <code>reflection</code>) worker role routing key Gateway role backend <strong></strong> () model GPU role
</HelpText>
</div>
<div>
<FieldLabel>api_key ()</FieldLabel>
<FieldInput