sync: update from private repo (22cd034)
Some checks failed
CI / build-and-test (push) Has been cancelled
Some checks failed
CI / build-and-test (push) Has been cancelled
This commit is contained in:
parent
3848b5efd7
commit
8ac98d2630
@ -117,14 +117,20 @@ llm:
|
||||
# shutdown_graceful_sec: 30 # SIGTERM 後、in-flight SSE の drain 上限秒
|
||||
#
|
||||
# backends:
|
||||
# - id: gpu-a # `x-aao-backend-id` / `/v1/models` に出る ID
|
||||
# # role ベース routing: worker はジョブの role (auto/fast/quality/reflection)
|
||||
# # を routing key として送り、Gateway はその role を担う最も空いている
|
||||
# # backend に振る。異なる model 名の GPU でも同じ role でまとめられる。
|
||||
# # roles 未指定の backend は全 role を担当 (アップグレード後も従来どおり動く)。
|
||||
# - id: gpu-a # `x-aao-backend-id` に出る ID
|
||||
# endpoint: http://gpu-a:11434/v1
|
||||
# model: qwen3:32b # 厳密一致 routing
|
||||
# model: qwen3:32b # backend が積むモデル (role が無い時の fallback 一致キー)
|
||||
# roles: [quality] # この backend は quality ティア専用
|
||||
# max_slots: 2 # llama-server -np と合わせる
|
||||
# api_key: ${GPU_A_API_KEY} # backend が bearer 必須な場合のみ
|
||||
# - id: gpu-b
|
||||
# endpoint: http://gpu-b:11434/v1
|
||||
# model: qwen3:32b
|
||||
# model: qwen3:8b # 別 GPU は別モデルでも OK
|
||||
# roles: [fast, auto] # fast/auto ティアを担当
|
||||
# max_slots: 2
|
||||
#
|
||||
# # Bootstrap / Backup 専用 virtual_keys (新規発行は admin API 経由を推奨)。
|
||||
|
||||
@ -30,6 +30,7 @@ import { runMigrations } from '../db/migrate.js';
|
||||
import {
|
||||
readGatewayConfig,
|
||||
validateGatewayConfig,
|
||||
lintGatewayConfig,
|
||||
type GatewayBackendConfig,
|
||||
type GatewayConfig,
|
||||
} from './config.js';
|
||||
@ -206,6 +207,11 @@ export async function start(opts: StartGatewayOptions = {}): Promise<StartedGate
|
||||
throw new Error(`gateway config has ${validationErrors.length} error(s); refusing to start`);
|
||||
}
|
||||
|
||||
// Non-fatal lint (role↔id/model collisions). Routes fine, just ambiguously.
|
||||
for (const w of lintGatewayConfig(gatewayConfig)) {
|
||||
logger.warn(`[gateway-bootstrap] config warning: ${w}`);
|
||||
}
|
||||
|
||||
logger.info(
|
||||
`[gateway-bootstrap] enabled listen_port=${gatewayConfig.listenPort} backends=${gatewayConfig.backends.length} virtual_keys=${gatewayConfig.virtualKeys.length}`,
|
||||
);
|
||||
|
||||
@ -11,8 +11,11 @@ import { describe, it, expect } from 'vitest';
|
||||
import {
|
||||
normalizeGatewayConfig,
|
||||
validateGatewayConfig,
|
||||
lintGatewayConfig,
|
||||
backendServesRole,
|
||||
readGatewayConfig,
|
||||
DEFAULT_GATEWAY_LISTEN_PORT,
|
||||
DEFAULT_GATEWAY_BACKEND_ROLES,
|
||||
type GatewayConfig,
|
||||
} from './config.js';
|
||||
import type { AppConfig } from '../config.js';
|
||||
@ -72,6 +75,79 @@ describe('normalizeGatewayConfig', () => {
|
||||
const norm = normalizeGatewayConfig({ enabled: 1 as unknown as boolean });
|
||||
expect(norm.enabled).toBe(false);
|
||||
});
|
||||
|
||||
// --- Role-based routing (2026-06-09) ---
|
||||
|
||||
it('parses backends[].roles into a deduped string array', () => {
|
||||
const norm = normalizeGatewayConfig({
|
||||
backends: [
|
||||
{ id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4, roles: ['quality', 'quality', 'auto'] },
|
||||
],
|
||||
});
|
||||
expect(norm.backends[0]!.roles).toEqual(['quality', 'auto']);
|
||||
});
|
||||
|
||||
it('drops empty / non-string role entries', () => {
|
||||
const norm = normalizeGatewayConfig({
|
||||
backends: [
|
||||
{ id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4, roles: ['fast', '', 7 as unknown as string, ' '] },
|
||||
],
|
||||
});
|
||||
expect(norm.backends[0]!.roles).toEqual(['fast']);
|
||||
});
|
||||
|
||||
it('leaves roles undefined when not provided (migration safety = serves all)', () => {
|
||||
const norm = normalizeGatewayConfig({
|
||||
backends: [{ id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4 }],
|
||||
});
|
||||
expect(norm.backends[0]!.roles).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
// Unit tests for backendServesRole: explicit role lists, the "no roles"
// default, and the empty-list edge case.
describe('backendServesRole', () => {
  // Fields shared by every fixture; individual tests only vary `roles`.
  const base = { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 1 };

  it('matches a declared role', () => {
    const servesQuality = backendServesRole({ ...base, roles: ['quality'] }, 'quality');
    const servesFast = backendServesRole({ ...base, roles: ['quality'] }, 'fast');
    expect(servesQuality).toBe(true);
    expect(servesFast).toBe(false);
  });

  it('serves every default execution role when roles is unspecified', () => {
    const unscoped = { ...base };
    for (const defaultRole of DEFAULT_GATEWAY_BACKEND_ROLES) {
      expect(backendServesRole(unscoped, defaultRole)).toBe(true);
    }
    // A key outside the default set is still rejected.
    expect(backendServesRole(unscoped, 'not-a-role')).toBe(false);
  });

  it('treats an empty roles array as unspecified (serves all)', () => {
    const emptyRoles = { ...base, roles: [] };
    expect(backendServesRole(emptyRoles, 'auto')).toBe(true);
  });
});
|
||||
|
||||
// Unit tests for lintGatewayConfig: a clean config yields no warnings, and a
// role name colliding with another backend's id or model yields one.
describe('lintGatewayConfig', () => {
  // True when some warning mentions `name` and describes a collision.
  const reportsCollision = (warnings: string[], name: RegExp): boolean =>
    warnings.some(message => name.test(message) && /collid/i.test(message));

  it('returns no warnings for a clean config', () => {
    const cfg = makeValidConfig();
    expect(lintGatewayConfig(cfg)).toEqual([]);
  });

  it('warns when a role name collides with another backend id', () => {
    const cfg = makeValidConfig();
    // The first backend is literally named "auto"; the second serves the "auto" role.
    cfg.backends = [
      { id: 'auto', endpoint: 'http://x/v1', model: 'm', maxSlots: 1 },
      { id: 'gpu-b', endpoint: 'http://y/v1', model: 'm', maxSlots: 1, roles: ['auto'] },
    ];
    expect(reportsCollision(lintGatewayConfig(cfg), /auto/)).toBe(true);
  });

  it('warns when a role name collides with a backend model', () => {
    const cfg = makeValidConfig();
    // The first backend's model is literally "fast"; the second serves the "fast" role.
    cfg.backends = [
      { id: 'gpu-a', endpoint: 'http://x/v1', model: 'fast', maxSlots: 1 },
      { id: 'gpu-b', endpoint: 'http://y/v1', model: 'm', maxSlots: 1, roles: ['fast'] },
    ];
    expect(reportsCollision(lintGatewayConfig(cfg), /fast/)).toBe(true);
  });
});
|
||||
|
||||
describe('validateGatewayConfig', () => {
|
||||
|
||||
@ -18,14 +18,45 @@ export interface GatewayBackendConfig {
|
||||
id: string;
|
||||
/** Upstream base URL — `/v1` suffix required so /chat/completions resolves cleanly. */
|
||||
endpoint: string;
|
||||
/** Model name this backend serves. The gateway routes by strict equality on `request.model`. */
|
||||
/** Model name this backend serves. Used as the backward-compat routing key (strict `request.model` equality). */
|
||||
model: string;
|
||||
/** Concurrent slots the upstream can serve. Mirrors llama-server's `-np`. */
|
||||
maxSlots: number;
|
||||
/**
|
||||
* Performance tiers (Roles) this backend serves: any of `auto`/`fast`/
|
||||
* `quality`/`reflection`. A worker sends the job's required role as the
|
||||
* routing key; the gateway routes role → least-busy backend declaring it.
|
||||
* This lets a heterogeneous GPU pool (different model names per box) be
|
||||
* grouped by tier rather than by exact model name.
|
||||
*
|
||||
* Undefined / empty = serves every default execution role
|
||||
* (DEFAULT_GATEWAY_BACKEND_ROLES) — keeps a pre-roles single-tier config
|
||||
* working unchanged after upgrade.
|
||||
*/
|
||||
roles?: string[];
|
||||
/** Optional bearer to forward to the backend (Ollama is usually unauthed). */
|
||||
apiKey?: string;
|
||||
}
|
||||
|
||||
/**
 * Tiers assumed for a backend that declares no `roles` of its own. The list
 * spans every routing key a worker emits: the job roles auto / fast /
 * quality / reflection, with title generation mapped to `auto` on the worker
 * side. Tier separation is therefore opt-in — it only takes effect once a
 * backend lists an explicit `roles` subset.
 */
export const DEFAULT_GATEWAY_BACKEND_ROLES: readonly string[] = ['auto', 'fast', 'quality', 'reflection'];

/**
 * Reports whether `backend` is configured to serve `role`.
 *
 * An absent or empty `roles` list is read as "serves every entry of
 * DEFAULT_GATEWAY_BACKEND_ROLES", which keeps a config written before roles
 * existed working unchanged. The router builds its role-match candidate pool
 * with this predicate and only falls back to strict id/model matching when
 * no backend serves the key.
 *
 * @param backend - Backend config entry to inspect.
 * @param role - Routing key to test, compared by strict string equality.
 * @returns `true` when `role` is in the backend's effective role list.
 */
export function backendServesRole(backend: GatewayBackendConfig, role: string): boolean {
  const declared = backend.roles;
  if (declared && declared.length > 0) {
    return declared.includes(role);
  }
  // No explicit tiers: fall back to the full default set.
  return DEFAULT_GATEWAY_BACKEND_ROLES.includes(role);
}
|
||||
|
||||
export interface GatewayVirtualKey {
|
||||
/** Bearer string a client sends in `Authorization: Bearer <key>`. */
|
||||
key: string;
|
||||
@ -190,11 +221,22 @@ function normalizeBackend(b: GatewayBackendConfig): GatewayBackendConfig {
|
||||
typeof ms === 'number' && Number.isFinite(ms) && ms > 0
|
||||
? Math.floor(ms)
|
||||
: NaN;
|
||||
// Roles: keep undefined (= serves all) when absent, else dedupe and drop
|
||||
// empty / non-string entries so a typo'd YAML list can't smuggle a blank
|
||||
// role that matches nothing.
|
||||
let roles: string[] | undefined;
|
||||
if (Array.isArray(b.roles)) {
|
||||
const cleaned = b.roles
|
||||
.filter((r): r is string => typeof r === 'string' && r.trim().length > 0)
|
||||
.map(r => r.trim());
|
||||
roles = [...new Set(cleaned)];
|
||||
}
|
||||
return {
|
||||
id: b.id,
|
||||
endpoint: b.endpoint,
|
||||
model: b.model,
|
||||
maxSlots,
|
||||
...(roles !== undefined ? { roles } : {}),
|
||||
apiKey: b.apiKey,
|
||||
};
|
||||
}
|
||||
@ -281,6 +323,11 @@ export function validateGatewayConfig(cfg: GatewayConfig): string[] {
|
||||
`gateway.backends[${b.id}].max_slots must be a positive integer (got ${b.maxSlots})`,
|
||||
);
|
||||
}
|
||||
if (b.roles !== undefined) {
|
||||
if (!Array.isArray(b.roles) || b.roles.some(r => typeof r !== 'string' || r.trim() === '')) {
|
||||
errors.push(`gateway.backends[${b.id}].roles must be an array of non-empty strings if defined`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -351,6 +398,48 @@ export function validateGatewayConfig(cfg: GatewayConfig): string[] {
|
||||
return errors;
|
||||
}
|
||||
|
||||
/**
 * Non-fatal lint pass over a gateway config.
 *
 * Kept separate from validateGatewayConfig (which returns boot-fatal errors)
 * because the conditions reported here still route — just ambiguously.
 *
 * Currently flags role-name ↔ backend-id/model collisions. The router
 * resolves a routing key by role membership FIRST and only falls back to an
 * exact id/model match when no backend serves the key. So a backend whose id
 * or model equals a role that ANOTHER backend serves cannot be addressed by
 * that literal name: the request lands in the role pool instead. The
 * operator probably did not intend the shadowing — warn, don't block.
 *
 * A name that matches a role served only by the backend itself is NOT
 * flagged: the role pool is then exactly that backend, so the literal name
 * still reaches it.
 *
 * A backend with no (or empty) `roles` is treated as serving every role in
 * DEFAULT_GATEWAY_BACKEND_ROLES, mirroring how the router selects candidates.
 *
 * @param cfg - Gateway config to inspect.
 * @returns Human-readable warnings; an empty array means clean. Also empty
 *   when `cfg.backends` is not an array.
 */
export function lintGatewayConfig(cfg: GatewayConfig): string[] {
  const warnings: string[] = [];
  if (!Array.isArray(cfg.backends)) return warnings;

  // Effective role list per backend (explicit, or default-implied),
  // index-aligned with cfg.backends.
  const effectiveRoles: (readonly string[])[] = [];
  for (const b of cfg.backends) {
    effectiveRoles.push(b.roles && b.roles.length > 0 ? b.roles : DEFAULT_GATEWAY_BACKEND_ROLES);
  }

  // Is `name` a role served by some backend other than the one at `self`?
  const servedByAnother = (name: string, self: number): boolean =>
    effectiveRoles.some((roles, i) => i !== self && roles.includes(name));

  for (const [i, b] of cfg.backends.entries()) {
    if (servedByAnother(b.id, i)) {
      warnings.push(
        `gateway.backends[${b.id}]: backend id '${b.id}' collides with a role name served by the pool — ` +
          `role routing shadows it, so '${b.id}' is unreachable as a literal model name`,
      );
    }
    // Skip the model check when it would only repeat the id warning.
    if (b.model && b.model !== b.id && servedByAnother(b.model, i)) {
      warnings.push(
        `gateway.backends[${b.id}]: model '${b.model}' collides with a role name served by the pool — ` +
          `role routing shadows it, so '${b.model}' is unreachable as a literal model name`,
      );
    }
  }
  return warnings;
}
|
||||
|
||||
/**
|
||||
* Pull the (possibly missing) gateway block off AppConfig. Used by
|
||||
* gateway/bootstrap.ts so both worker and gateway modes can call
|
||||
|
||||
@ -11,9 +11,13 @@ describe('buildModelsResponse', () => {
|
||||
expect(resp.data).toEqual([]);
|
||||
});
|
||||
|
||||
it('maps each backend id to a model entry', () => {
|
||||
it('maps each backend id to a model entry (id-shaped, not role-shaped)', () => {
|
||||
// /v1/models stays backend-id-shaped: the Settings "physical backends"
|
||||
// panel (config-api /api/workers/:id/backends) reads it to list the GPUs
|
||||
// behind a proxy worker. Role-based routing lives in the router + worker,
|
||||
// not in this discovery endpoint (2026-06-09 decision).
|
||||
const resp = buildModelsResponse([
|
||||
{ id: 'gpu-a', endpoint: 'http://a/v1', model: 'qwen3:8b', maxSlots: 4 },
|
||||
{ id: 'gpu-a', endpoint: 'http://a/v1', model: 'qwen3:8b', maxSlots: 4, roles: ['quality'] },
|
||||
{ id: 'gpu-b', endpoint: 'http://b/v1', model: 'qwen3:14b', maxSlots: 4 },
|
||||
]);
|
||||
expect(resp.data).toEqual([
|
||||
|
||||
@ -4,7 +4,15 @@
|
||||
* Returns one entry per `gateway.backends[]` with `id` set to the backend
|
||||
* id (which is also what `x-aao-backend-id` and `x-litellm-model-id` will
|
||||
* carry in responses). Clients hit this endpoint to discover which model
|
||||
* names they may request.
|
||||
* names they may request, and `/api/workers/:id/backends` (the Settings
|
||||
* "physical backends" panel) reads it to list the GPUs behind a proxy
|
||||
* worker — so it stays backend-id-shaped, NOT role-shaped.
|
||||
*
|
||||
* Role-based routing (2026-06-09) lives entirely in the router + the worker
|
||||
* (which sends a role as the routing key). The over-concentration bug was
|
||||
* fixed worker-side by dropping model auto-select in gateway mode, so this
|
||||
* endpoint did not need to change. A client may still request a role: the
|
||||
* router's role-match arm handles it even though roles aren't advertised here.
|
||||
*
|
||||
* Auth required: the response surfaces backend ids which could otherwise
|
||||
* be enumerated by an unauthenticated probe.
|
||||
|
||||
@ -41,6 +41,10 @@ function bk(id: string, model: string, maxSlots = 4): GatewayBackendConfig {
|
||||
return { id, endpoint: `http://${id}/v1`, model, maxSlots };
|
||||
}
|
||||
|
||||
/**
 * Test fixture: a backend config with an explicit `roles` list. The endpoint
 * is derived from the id as `http://<id>/v1`.
 */
function bkRoles(id: string, model: string, roles: string[], maxSlots = 4): GatewayBackendConfig {
  const endpoint = `http://${id}/v1`;
  const backend: GatewayBackendConfig = { id, endpoint, model, maxSlots, roles };
  return backend;
}
|
||||
|
||||
describe('createRouter.pick', () => {
|
||||
it('returns null when model has no configured backend', () => {
|
||||
const r = createRouter({
|
||||
@ -195,6 +199,95 @@ describe('createRouter.pick', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Role-based routing behaviour of createRouter().pick: tier isolation,
// saturation handling, per-backend slot limits, the "no roles" default, the
// legacy id/model fallback, and role-over-id precedence.
describe('createRouter.pick — role-based routing (2026-06-09)', () => {
  it('routes a role key only to backends that declare the role', () => {
    // Mixed pool: the quality tier is served by two GPUs running DIFFERENT
    // models, and a third GPU serves only fast. Even though the fast box is
    // the idlest, a "quality" request must stay inside the quality tier.
    const registry = makeRegistry([
      status('q1', true, 3, 4), // ratio 0.75
      status('q2', true, 1, 4), // ratio 0.25 (winner within the role)
      status('f1', true, 0, 4), // idle but WRONG role
    ]);
    const router = createRouter({
      getBackends: () => [
        bkRoles('q1', 'big-model-a.gguf', ['quality']),
        bkRoles('q2', 'big-model-b.gguf', ['quality']),
        bkRoles('f1', 'small-model.gguf', ['fast']),
      ],
      registry,
    });

    const chosen = router.pick('quality');
    expect(chosen?.id).toBe('q2');
  });

  it('returns null (503) when every backend serving the role is saturated — no spill to other tiers', () => {
    const registry = makeRegistry([
      status('q1', true, 2, 2), // quality full
      status('f1', true, 0, 2), // fast idle — must NOT be picked for quality
    ]);
    const router = createRouter({
      getBackends: () => [
        bkRoles('q1', 'a.gguf', ['quality'], 2),
        bkRoles('f1', 'b.gguf', ['fast'], 2),
      ],
      registry,
    });

    expect(router.pick('quality')).toBeNull();
  });

  it('enforces per-id max_slots independently within a role', () => {
    const inflight = createBackendInflightCounter();
    const registry = makeRegistry([status('q1', true, 0, 1), status('q2', true, 0, 1)]);
    const router = createRouter({
      getBackends: () => [
        bkRoles('q1', 'a.gguf', ['quality'], 1),
        bkRoles('q2', 'b.gguf', ['quality'], 1),
      ],
      registry,
      inflight,
    });

    // Two picks in a row must land on the two distinct single-slot backends.
    const firstTwo = [router.pick('quality')!.id, router.pick('quality')!.id];
    expect(new Set(firstTwo)).toEqual(new Set(['q1', 'q2'])); // one each
    expect(router.pick('quality')).toBeNull(); // both reserved
  });

  it('treats a backend with unspecified roles as serving every role', () => {
    const router = createRouter({
      getBackends: () => [bk('generalist', 'm.gguf')], // no roles => all
      registry: makeRegistry([status('generalist', true, 0, 4)]),
    });

    for (const tier of ['auto', 'fast', 'quality']) {
      expect(router.pick(tier)?.id).toBe('generalist');
    }
  });

  it('falls back to id/model exact match when NO backend declares the role (backward compat)', () => {
    // Legacy callers send a literal model name. Since no backend lists that
    // name as a role, the router falls through to strict id/model matching.
    const router = createRouter({
      getBackends: () => [bkRoles('q1', 'legacy-model.gguf', ['quality'])],
      registry: makeRegistry([status('q1', true, 0, 4)]),
    });

    const byModel = router.pick('legacy-model.gguf');
    expect(byModel?.id).toBe('q1'); // model-name fallback
    const byId = router.pick('q1');
    expect(byId?.id).toBe('q1'); // id fallback
  });

  it('prefers role match over an id/model collision (role takes precedence)', () => {
    // 'quality' is simultaneously a role declared by roleBackend and the id
    // of a different backend. Role membership wins per decision 3.
    const registry = makeRegistry([
      status('roleBackend', true, 0, 4),
      status('quality', true, 0, 4),
    ]);
    const router = createRouter({
      getBackends: () => [
        bkRoles('roleBackend', 'm-a.gguf', ['quality']),
        bkRoles('quality', 'm-b.gguf', ['fast']), // id literally "quality" but serves fast — the trap
      ],
      registry,
    });

    // Repeated picks: only the role-declaring backend is ever eligible.
    for (let attempt = 0; attempt < 5; attempt += 1) {
      expect(router.pick('quality')?.id).toBe('roleBackend');
    }
  });
});
|
||||
|
||||
describe('createRouter.pick — live reservation (per-host max_slots enforcement)', () => {
|
||||
it('reserves a slot on pick so a maxSlots=1 backend saturates after one pick', () => {
|
||||
// Root-cause regression: before the fix, pick() judged saturation
|
||||
|
||||
@ -2,16 +2,19 @@
|
||||
* Least-busy backend router.
|
||||
*
|
||||
* Selection algorithm:
|
||||
* 1. Filter to backends whose configured `id` OR `model` strictly
|
||||
* matches `request.model`. Accepting both means a client can
|
||||
* either name a specific deployment (`b.id`, mirrors LiteLLM's
|
||||
* deployment-name convention used in our /v1/models + /health
|
||||
* responses) or fall back to the generic model alias (`b.model`,
|
||||
* for tools that pass through the raw model name configured on
|
||||
* the worker side). Without the `b.id` arm, the gateway exposed
|
||||
* ids via /v1/models that the worker would faithfully echo back
|
||||
* in the chat request, and routing would 503 because nothing
|
||||
* matched on b.model. Discovered during 2026-05-20 dogfooding.
|
||||
* 1. Pick the candidate pool by ROLE first (2026-06-09). The routing
|
||||
* key (`request.model`) is normally a performance tier a worker
|
||||
* sends — auto/fast/quality/reflection. Backends declare which
|
||||
* tiers they serve via `backends[].roles`; a backend with no
|
||||
* `roles` serves them all. This groups a heterogeneous GPU pool
|
||||
* (different model names per box) by tier rather than by exact
|
||||
* model name. Membership is by config, not live load, so a fully
|
||||
* saturated tier returns 503 instead of spilling to another tier.
|
||||
* Only when NO backend serves the key do we fall back to the
|
||||
* legacy strict `id` OR `model` match — keeping pre-roles configs
|
||||
* and raw OpenAI clients (which name a deployment id/model alias,
|
||||
* mirrored in /v1/models + /health) working. The id arm fixed a
|
||||
* 503 found during 2026-05-20 dogfooding.
|
||||
* 2. Drop backends the registry says are offline or saturated
|
||||
* (`busy_slots >= total_slots`).
|
||||
* 3. Sort the survivors by `busy_slots / total_slots` ascending —
|
||||
@ -27,7 +30,7 @@
|
||||
* predictable in logs. The bias is at most O(slots) over the lifetime
|
||||
* of one process, which is fine for a single binary.
|
||||
*/
|
||||
import type { GatewayBackendConfig } from './config.js';
|
||||
import { backendServesRole, type GatewayBackendConfig } from './config.js';
|
||||
import type { BackendStatusRegistry, NodeStatus } from '../engine/backend-status-registry.js';
|
||||
|
||||
export interface RouterSnapshotBackend {
|
||||
@ -122,7 +125,24 @@ export function createRouter(deps: CreateRouterDeps): Router {
|
||||
deps.registry.noteSubscriberActivity?.();
|
||||
|
||||
if (typeof model !== 'string' || model.length === 0) return null;
|
||||
const backends = deps.getBackends();
|
||||
const allBackends = deps.getBackends();
|
||||
// Candidate pool selection (role-based, dual-match — 2026-06-09).
|
||||
//
|
||||
// The routing key (`model`) is normally a performance tier/role the
|
||||
// worker sends (auto/fast/quality/reflection). Route to backends that
|
||||
// DECLARE that role first; this groups a heterogeneous GPU pool by
|
||||
// tier instead of by exact model name. Role membership is by config
|
||||
// (backendServesRole), NOT by live availability — so if any backend
|
||||
// serves the role, we stay inside that tier even when all of them are
|
||||
// saturated (→ 503), rather than leaking the request to another tier.
|
||||
//
|
||||
// Only when NO backend serves the key do we fall back to the legacy
|
||||
// strict id/model match. That keeps pre-roles configs and non-maestro
|
||||
// OpenAI clients (which send a literal model name) working unchanged.
|
||||
const roleMatched = allBackends.filter(b => backendServesRole(b, model));
|
||||
const backends = roleMatched.length > 0
|
||||
? roleMatched
|
||||
: allBackends.filter(b => b.id === model || b.model === model);
|
||||
// One getAll() per pick, not one per backend — the previous
|
||||
// `lookupStatus(b.id)` inside the loop did `registry.getAll()`
|
||||
// (which slices its internal cache) once per candidate. With N
|
||||
@ -137,11 +157,9 @@ export function createRouter(deps: CreateRouterDeps): Router {
|
||||
const warm: RankedCandidate[] = [];
|
||||
const cold: RankedCandidate[] = [];
|
||||
for (const b of backends) {
|
||||
// Match either the deployment id (LiteLLM-style routing key
|
||||
// mirrored in /v1/models + /health) or the model alias the
|
||||
// backend serves. See top-of-file docstring for why both arms
|
||||
// are needed.
|
||||
if (b.id !== model && b.model !== model) continue;
|
||||
// `backends` is already filtered to the chosen candidate pool
|
||||
// (role match, else id/model fallback) — no per-backend match
|
||||
// check needed here.
|
||||
// Live reservation count for this backend (0 when no inflight
|
||||
// counter is wired). Used both to gate admission and to rank.
|
||||
const reserved = deps.inflight?.get(b.id) ?? 0;
|
||||
|
||||
124
src/llm/routing-key.test.ts
Normal file
124
src/llm/routing-key.test.ts
Normal file
@ -0,0 +1,124 @@
|
||||
/**
|
||||
* Routing-key selection tests.
|
||||
*
|
||||
* The single decision shared by every LLM call site: gateway (proxy) mode
|
||||
* routes by Role; direct mode sends the resolved model name. See
|
||||
* docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md.
|
||||
*/
|
||||
import { describe, it, expect, vi } from 'vitest';
|
||||
import { llmRoutingKey, shouldRequeueForModelMismatch } from './routing-key.js';
|
||||
|
||||
// Unit tests for llmRoutingKey: gateway mode yields a role key, direct mode
// yields whatever the model resolver returns.
describe('llmRoutingKey', () => {
  it('gateway mode sends the role as the key', () => {
    expect(
      llmRoutingKey({
        isGateway: true,
        role: 'quality',
        resolveDirectModel: () => 'should-not-be-used.gguf',
      }),
    ).toBe('quality');
  });

  it('gateway mode never evaluates the direct-model resolver (no /v1/models, no auto-select)', () => {
    const resolverSpy = vi.fn(() => 'm');
    llmRoutingKey({ isGateway: true, role: 'fast', resolveDirectModel: resolverSpy });
    expect(resolverSpy).not.toHaveBeenCalled();
  });

  it('gateway mode falls back to "auto" when role is undefined', () => {
    const key = llmRoutingKey({ isGateway: true, role: undefined, resolveDirectModel: () => 'm' });
    expect(key).toBe('auto');
  });

  it('gateway mode honours a custom role fallback (e.g. reflection)', () => {
    const key = llmRoutingKey({
      isGateway: true,
      role: undefined,
      resolveDirectModel: () => 'm',
      roleFallback: 'reflection',
    });
    expect(key).toBe('reflection');
  });

  it('direct mode returns the resolved model and ignores the role', () => {
    expect(
      llmRoutingKey({
        isGateway: false,
        role: 'quality',
        resolveDirectModel: () => 'qwen3:8b',
      }),
    ).toBe('qwen3:8b');
  });

  it('direct mode preserves an undefined model (llama-server model omission)', () => {
    const key = llmRoutingKey({ isGateway: false, role: 'auto', resolveDirectModel: () => undefined });
    expect(key).toBeUndefined();
  });
});
|
||||
|
||||
// Unit tests for shouldRequeueForModelMismatch: gateway mode never requeues;
// direct mode requeues only when a pinned model is provably unavailable.
describe('shouldRequeueForModelMismatch', () => {
  it('NEVER requeues in gateway mode — availableModels holds roles, gateway owns model selection', () => {
    // Regression guard: in gateway mode availableModels is the advertised
    // role set (auto/fast/quality). A piece pinning a concrete model could
    // never match it, which would requeue the job forever.
    const verdict = shouldRequeueForModelMismatch({
      isGateway: true,
      pieceModel: 'qwen3:32b',
      availableModels: new Set(['auto', 'fast', 'quality']),
      workerModel: '',
    });
    expect(verdict).toBe(false);
  });

  it('direct mode requeues when the piece pins an unavailable model', () => {
    const verdict = shouldRequeueForModelMismatch({
      isGateway: false,
      pieceModel: 'qwen3:32b',
      availableModels: new Set(['qwen3:8b']),
      workerModel: 'qwen3:8b',
    });
    expect(verdict).toBe(true);
  });

  it('direct mode does NOT requeue when the piece model is available', () => {
    const verdict = shouldRequeueForModelMismatch({
      isGateway: false,
      pieceModel: 'qwen3:8b',
      availableModels: new Set(['qwen3:8b', 'qwen3:32b']),
      workerModel: 'qwen3:8b',
    });
    expect(verdict).toBe(false);
  });

  it('direct mode does NOT requeue when the piece model equals the worker model (probe empty)', () => {
    const verdict = shouldRequeueForModelMismatch({
      isGateway: false,
      pieceModel: 'qwen3:8b',
      availableModels: new Set(['other']),
      workerModel: 'qwen3:8b',
    });
    expect(verdict).toBe(false);
  });

  it('direct mode does NOT requeue when the piece pins no model', () => {
    const verdict = shouldRequeueForModelMismatch({
      isGateway: false,
      pieceModel: undefined,
      availableModels: new Set(['qwen3:8b']),
      workerModel: 'qwen3:8b',
    });
    expect(verdict).toBe(false);
  });

  it('direct mode does NOT requeue before the model list is probed (size 0)', () => {
    const verdict = shouldRequeueForModelMismatch({
      isGateway: false,
      pieceModel: 'qwen3:32b',
      availableModels: new Set(),
      workerModel: '',
    });
    expect(verdict).toBe(false);
  });
});
|
||||
61
src/llm/routing-key.ts
Normal file
61
src/llm/routing-key.ts
Normal file
@ -0,0 +1,61 @@
|
||||
/**
 * The single decision shared by every LLM call site (job execution, ASK
 * answer, title generation, reflection): what to put in `request.model`.
 *
 *  - Gateway (proxy) mode routes by ROLE. The AAO Gateway resolves the role
 *    (auto/fast/quality/reflection) to a least-busy backend declaring it, so
 *    a heterogeneous GPU pool stays grouped by performance tier instead of by
 *    exact model name. No /v1/models discovery, no model auto-select — those
 *    collapsed every tier onto backends[0] (the 2026-06-09 over-concentration
 *    bug).
 *  - Direct mode sends the resolved model name (piece override, configured
 *    model, or omission for llama-server) exactly as before.
 *
 * The direct-model side is a thunk so gateway mode never runs the (logging,
 * auto-select) resolution path. See
 * docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md.
 *
 * In gateway mode a blank (empty or whitespace-only) `role` or
 * `roleFallback` is treated the same as a missing one, and the chosen key is
 * trimmed. The gateway router rejects an empty routing key outright and
 * trims role names on its side, so a blank or padded key could never
 * role-match.
 *
 * @param params - See the per-field comments below.
 * @returns Gateway mode: the first non-blank of `role`, `roleFallback`,
 *   `'auto'` (trimmed). Direct mode: whatever `resolveDirectModel` returns,
 *   including `undefined`.
 */
export function llmRoutingKey(params: {
  /** True when the worker connects through the AAO Gateway (connection_type: aao_gateway → proxy). */
  isGateway: boolean;
  /** The job's required tier (or the worker's serving tier for title generation). */
  role: string | undefined;
  /** Lazily resolves the direct-mode model. NOT called in gateway mode. */
  resolveDirectModel: () => string | undefined;
  /** Role key when `role` is undefined or blank. Defaults to 'auto'. */
  roleFallback?: string;
}): string | undefined {
  if (!params.isGateway) {
    return params.resolveDirectModel();
  }
  // First usable candidate wins; blank strings count as "not provided".
  for (const candidate of [params.role, params.roleFallback]) {
    const key = candidate?.trim();
    if (key) return key;
  }
  return 'auto';
}
|
||||
|
||||
/**
 * Decides whether the worker should hand a job back to the queue because the
 * piece pins a model this worker cannot serve.
 *
 * Direct mode: requeue only when ALL of the following hold —
 *   - the piece names a model,
 *   - the probed `availableModels` set is non-empty,
 *   - that set does not contain the piece's model, and
 *   - the worker's own configured model is not the piece's model.
 * In that case another worker may be able to serve it.
 *
 * Gateway mode: always `false`. The worker routes by role and
 * `availableModels` holds the gateway's advertised role set rather than
 * model names, so a piece pinning a real model would never match and the job
 * would requeue forever; the gateway owns model selection there. See
 * docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md.
 *
 * @returns `true` when the job should be requeued.
 */
export function shouldRequeueForModelMismatch(params: {
  isGateway: boolean;
  pieceModel: string | undefined;
  availableModels: Set<string>;
  workerModel: string | undefined;
}): boolean {
  const { isGateway, pieceModel, availableModels, workerModel } = params;

  // Each guard below is one reason NOT to requeue.
  if (isGateway) return false;
  if (!pieceModel) return false; // nothing pinned
  if (availableModels.size === 0) return false; // model list is empty
  if (availableModels.has(pieceModel)) return false; // pinned model is listed

  return workerModel !== pieceModel;
}
|
||||
@ -19,6 +19,7 @@ import { logger } from './logger.js';
|
||||
import { accessSync, existsSync, mkdirSync, constants } from 'fs';
|
||||
import { dirname, resolve, join } from 'path';
|
||||
import { OpenAICompatClient } from './llm/openai-compat.js';
|
||||
import { llmRoutingKey } from './llm/routing-key.js';
|
||||
import { ConfigManager } from './config-manager.js';
|
||||
import { WorkerManager } from './worker-manager.js';
|
||||
import { classifyPiece } from './engine/piece-classifier.js';
|
||||
@ -127,10 +128,23 @@ export async function start(opts: StartWorkerOptions = {}): Promise<void> {
|
||||
|
||||
if (titleWorker) {
|
||||
const titleModel = titleWorker.model ?? config.provider.model;
|
||||
logger.info(`Config: title generation worker=${titleWorker.id} model=${titleModel ?? '<none>'}`);
|
||||
// Gateway (proxy) mode routes by role, not model name. Title generation
|
||||
// is cheap, so send a tier the title worker serves (first of auto/fast/
|
||||
// quality), defaulting to 'auto'. Direct mode sends the configured model.
|
||||
const titleTier =
|
||||
titleWorker.roles?.find(r => r === 'auto' || r === 'fast' || r === 'quality') ?? 'auto';
|
||||
const titleRoutingKey = llmRoutingKey({
|
||||
isGateway: titleWorker.proxy === true,
|
||||
role: titleTier,
|
||||
resolveDirectModel: () => titleModel,
|
||||
});
|
||||
logger.info(
|
||||
`Config: title generation worker=${titleWorker.id} key=${titleRoutingKey ?? '<none>'}` +
|
||||
(titleWorker.proxy === true ? ' (gateway:role)' : ''),
|
||||
);
|
||||
titleClient = new OpenAICompatClient(
|
||||
titleWorker.endpoint,
|
||||
titleModel,
|
||||
titleRoutingKey,
|
||||
undefined,
|
||||
config.provider.retry,
|
||||
(config.provider.timeoutMinutes ?? 10) * 60 * 1000,
|
||||
|
||||
@ -4,6 +4,7 @@ import { BrowserSessionRepo } from './db/browser-session-repo.js';
|
||||
import { assertProfileOwner } from './engine/browser-session-auth.js';
|
||||
import { initMasterKey, decryptUserDek, decryptStateBlob } from './crypto/sessions.js';
|
||||
import { OpenAICompatClient } from './llm/openai-compat.js';
|
||||
import { llmRoutingKey, shouldRequeueForModelMismatch } from './llm/routing-key.js';
|
||||
import { loadPiece, runPiece, PieceRunCallbacks, PieceDef, type PieceRunResult } from './engine/piece-runner.js';
|
||||
import { LocalProgressReporter } from './progress/local-reporter.js';
|
||||
import { buildLocalConversationContext } from './engine/local-context.js';
|
||||
@ -697,9 +698,14 @@ export class Worker {
|
||||
const parentJob = await this.repo.getJob(parentJobId);
|
||||
const parentInstruction = parentJob?.instruction ?? '(不明)';
|
||||
|
||||
const resolvedModel = this.model;
|
||||
const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
|
||||
const workerDefForAnswer = this.getWorkerDef();
|
||||
// Gateway routes by the subtask's tier; direct keeps the worker's model.
|
||||
const resolvedModel = llmRoutingKey({
|
||||
isGateway: workerDefForAnswer.proxy === true,
|
||||
role: subtaskJob.requiredRole,
|
||||
resolveDirectModel: () => this.model,
|
||||
});
|
||||
const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
|
||||
const llmClient = new OpenAICompatClient(
|
||||
this.endpoint,
|
||||
resolvedModel,
|
||||
@ -954,11 +960,15 @@ export class Worker {
|
||||
].filter((d): d is string => !!d);
|
||||
logger.info(`[worker:${this.workerId}] job ${jobId} loadPiece piece=${job.pieceName} customDirs=[${customPieceDirs.join(', ') || 'none'}] piecesDir=pieces`);
|
||||
const piece = loadPiece(job.pieceName, 'pieces', customPieceDirs);
|
||||
// Model-mismatch requeue gate (direct mode only — gateway routes by
|
||||
// role, see shouldRequeueForModelMismatch).
|
||||
if (
|
||||
piece.model &&
|
||||
this.availableModels.size > 0 &&
|
||||
!this.availableModels.has(piece.model) &&
|
||||
this.model !== piece.model
|
||||
shouldRequeueForModelMismatch({
|
||||
isGateway: this.getWorkerDef().proxy === true,
|
||||
pieceModel: piece.model,
|
||||
availableModels: this.availableModels,
|
||||
workerModel: this.model,
|
||||
})
|
||||
) {
|
||||
await this.repo.updateJob(jobId, {
|
||||
status: 'queued',
|
||||
@ -996,11 +1006,16 @@ export class Worker {
|
||||
return;
|
||||
}
|
||||
|
||||
// Piece のモデル指定を解決
|
||||
const resolvedModel = this.resolveModel(piece);
|
||||
const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
|
||||
const workerDefForLlm = this.getWorkerDef();
|
||||
const isProxyWorker = workerDefForLlm.proxy === true;
|
||||
// Gateway routes by role; direct resolves the worker's model. The
|
||||
// resolver thunk runs only in direct mode (no auto-select via gateway).
|
||||
const resolvedModel = llmRoutingKey({
|
||||
isGateway: isProxyWorker,
|
||||
role: job.requiredRole,
|
||||
resolveDirectModel: () => this.resolveModel(piece),
|
||||
});
|
||||
const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
|
||||
const llmClient = new OpenAICompatClient(
|
||||
this.endpoint,
|
||||
resolvedModel,
|
||||
@ -1652,12 +1667,20 @@ export class Worker {
|
||||
private async handleReflectionJob(job: Job): Promise<void> {
|
||||
const { runReflectionJob } = await import('./engine/reflection/reflection-runner.js');
|
||||
try {
|
||||
// Gateway mode routes by role: send the reflection tier as the key
|
||||
// (job.requiredRole is 'reflection'), not the worker's model name.
|
||||
const reflectionRoutingKey = llmRoutingKey({
|
||||
isGateway: this.getWorkerDef().proxy === true,
|
||||
role: job.requiredRole,
|
||||
resolveDirectModel: () => this.model,
|
||||
roleFallback: 'reflection',
|
||||
});
|
||||
const outcome = await runReflectionJob(
|
||||
{
|
||||
repo: this.repo,
|
||||
config: this.config,
|
||||
llmEndpoint: this.endpoint,
|
||||
llmModel: this.model,
|
||||
llmModel: reflectionRoutingKey,
|
||||
},
|
||||
job
|
||||
);
|
||||
|
||||
@ -38,9 +38,28 @@ interface GatewayBackend {
|
||||
endpoint?: string;
|
||||
model?: string;
|
||||
maxSlots?: number;
|
||||
/** Performance tiers (Roles) this backend serves. Empty = serves all. */
|
||||
roles?: string[];
|
||||
apiKey?: string;
|
||||
}
|
||||
|
||||
/**
 * Format a roles array for the comma-separated text input.
 *
 * @param roles - roles configured on a backend; may be absent.
 * @returns the roles joined with ", ", or an empty string when `roles` is
 *   not an array (including `undefined`) or is empty.
 */
function rolesToInput(roles: string[] | undefined): string {
  // Array.isArray also guards against non-array values in loosely typed config.
  return Array.isArray(roles) ? roles.join(', ') : '';
}
|
||||
|
||||
/**
 * Parse the comma-separated roles input into a deduped string array, or
 * `undefined` when blank (= backend serves every role, migration-safe).
 *
 * Each entry is trimmed; empty entries (e.g. from a trailing comma) are
 * dropped. First-occurrence order is preserved when removing duplicates.
 *
 * @param v - raw text from the roles input field.
 * @returns the distinct role names in input order, or `undefined` when no
 *   non-empty entry remains.
 */
function parseRolesInput(v: string): string[] | undefined {
  const parts = v
    .split(',')
    .map(s => s.trim())
    .filter(s => s.length > 0);
  // A Set keeps insertion order, so spreading it dedupes without reordering.
  return parts.length > 0 ? [...new Set(parts)] : undefined;
}
|
||||
|
||||
interface GatewayConfigShape {
|
||||
enabled?: boolean;
|
||||
listenPort?: number;
|
||||
@ -246,7 +265,7 @@ export function GatewayServerForm({ config, onChange }: SectionFormProps) {
|
||||
</button>
|
||||
</div>
|
||||
<HelpText>
|
||||
ルーティング先の llama-server / Ollama / vLLM など。Gateway は <code>request.model</code> に一致する <code>model</code> を持つ最も busy ではない backend に割り振ります。<br/>
|
||||
ルーティング先の llama-server / Ollama / vLLM など。Gateway は worker が送る <strong>role</strong> を担う backend のうち最も空いているものに割り振ります (<code>roles</code> 未設定の backend は全 role 対応)。role を担う backend が無い場合のみ <code>request.model</code> = <code>id</code>/<code>model</code> の厳密一致にフォールバックします。<br/>
|
||||
<strong>api_key の保存形式</strong>: フォームで入力した値は <code>config.yaml</code> に平文で保存されます。<code>${'${VAR}'}</code> 形式の env var 参照はフォーム保存時に literal 文字列として保存されるため、env 経由で渡したい場合は <code>config.yaml</code> を直接編集してください。
|
||||
</HelpText>
|
||||
{backends.length === 0 ? (
|
||||
@ -291,6 +310,17 @@ export function GatewayServerForm({ config, onChange }: SectionFormProps) {
|
||||
placeholder="1"
|
||||
/>
|
||||
</div>
|
||||
<div className="col-span-2">
|
||||
<FieldLabel>roles (任意)</FieldLabel>
|
||||
<FieldInput
|
||||
value={rolesToInput(b.roles)}
|
||||
onChange={v => updateBackend(i, 'roles', parseRolesInput(v))}
|
||||
placeholder="quality, auto (空欄=全ロール)"
|
||||
/>
|
||||
<HelpText>
|
||||
この backend が担う性能ティア (<code>auto</code> / <code>fast</code> / <code>quality</code> / <code>reflection</code>) をカンマ区切りで。worker はジョブの role を routing key として送り、Gateway がその role を担う最も空いている backend に振ります。<strong>空欄なら全ロール</strong>を担当 (従来どおり)。異なる model 名の GPU でも同じ role でまとめられます。
|
||||
</HelpText>
|
||||
</div>
|
||||
<div>
|
||||
<FieldLabel>api_key (任意)</FieldLabel>
|
||||
<FieldInput
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user