sync: update from private repo (22cd034)

2026-06-09 06:02:32 +00:00 · 2026-06-09 06:02:32 +00:00 · 8ac98d2630
commit 8ac98d2630
parent 3848b5efd7
13 changed files with 589 additions and 37 deletions
--- a/config.yaml.example
+++ b/config.yaml.example
@ -117,14 +117,20 @@ llm:
 #   shutdown_graceful_sec: 30                # SIGTERM 後、in-flight SSE の drain 上限秒
 #
 #   backends:
-#     - id: gpu-a                            # `x-aao-backend-id` / `/v1/models` に出る ID
+#     # role ベース routing: worker はジョブの role (auto/fast/quality/reflection)
+#     # を routing key として送り、Gateway はその role を担う最も空いている
+#     # backend に振る。異なる model 名の GPU でも同じ role でまとめられる。
+#     # roles 未指定の backend は全 role を担当 (アップグレード後も従来どおり動く)。
+#     - id: gpu-a                            # `x-aao-backend-id` に出る ID
 #       endpoint: http://gpu-a:11434/v1
-#       model: qwen3:32b                     # 厳密一致 routing
+#       model: qwen3:32b                     # backend が積むモデル (どの backend も key を role として担わない時の fallback 一致キー)
+#       roles: [quality]                     # この backend は quality ティア専用
 #       max_slots: 2                         # llama-server -np と合わせる
 #       api_key: ${GPU_A_API_KEY}            # backend が bearer 必須な場合のみ
 #     - id: gpu-b
 #       endpoint: http://gpu-b:11434/v1
-#       model: qwen3:32b
+#       model: qwen3:8b                      # 別 GPU は別モデルでも OK
+#       roles: [fast, auto]                  # fast/auto ティアを担当
 #       max_slots: 2
 #
 #   # Bootstrap / Backup 専用 virtual_keys (新規発行は admin API 経由を推奨)。
--- a/src/gateway/bootstrap.ts
+++ b/src/gateway/bootstrap.ts
@ -30,6 +30,7 @@ import { runMigrations } from '../db/migrate.js';
 import {
  readGatewayConfig,
  validateGatewayConfig,
+  lintGatewayConfig,
  type GatewayBackendConfig,
  type GatewayConfig,
 } from './config.js';
@ -206,6 +207,11 @@ export async function start(opts: StartGatewayOptions = {}): Promise<StartedGate
    throw new Error(`gateway config has ${validationErrors.length} error(s); refusing to start`);
  }

+  // Non-fatal lint (role↔id/model collisions). Routes fine, just ambiguously.
+  for (const w of lintGatewayConfig(gatewayConfig)) {
+    logger.warn(`[gateway-bootstrap] config warning: ${w}`);
+  }
+
  logger.info(
    `[gateway-bootstrap] enabled listen_port=${gatewayConfig.listenPort} backends=${gatewayConfig.backends.length} virtual_keys=${gatewayConfig.virtualKeys.length}`,
  );
--- a/src/gateway/config.test.ts
+++ b/src/gateway/config.test.ts
@ -11,8 +11,11 @@ import { describe, it, expect } from 'vitest';
 import {
  normalizeGatewayConfig,
  validateGatewayConfig,
+  lintGatewayConfig,
+  backendServesRole,
  readGatewayConfig,
  DEFAULT_GATEWAY_LISTEN_PORT,
+  DEFAULT_GATEWAY_BACKEND_ROLES,
  type GatewayConfig,
 } from './config.js';
 import type { AppConfig } from '../config.js';
@ -72,6 +75,79 @@ describe('normalizeGatewayConfig', () => {
    const norm = normalizeGatewayConfig({ enabled: 1 as unknown as boolean });
    expect(norm.enabled).toBe(false);
  });
+
+  // --- Role-based routing (2026-06-09) ---
+
+  it('parses backends[].roles into a deduped string array', () => {
+    const norm = normalizeGatewayConfig({
+      backends: [
+        { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4, roles: ['quality', 'quality', 'auto'] },
+      ],
+    });
+    expect(norm.backends[0]!.roles).toEqual(['quality', 'auto']);
+  });
+
+  it('drops empty / non-string role entries', () => {
+    const norm = normalizeGatewayConfig({
+      backends: [
+        { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4, roles: ['fast', '', 7 as unknown as string, '  '] },
+      ],
+    });
+    expect(norm.backends[0]!.roles).toEqual(['fast']);
+  });
+
+  it('leaves roles undefined when not provided (migration safety = serves all)', () => {
+    const norm = normalizeGatewayConfig({
+      backends: [{ id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 4 }],
+    });
+    expect(norm.backends[0]!.roles).toBeUndefined();
+  });
+});
+
+describe('backendServesRole', () => {
+  it('matches a declared role', () => {
+    expect(backendServesRole({ id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 1, roles: ['quality'] }, 'quality')).toBe(true);
+    expect(backendServesRole({ id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 1, roles: ['quality'] }, 'fast')).toBe(false);
+  });
+
+  it('serves every default execution role when roles is unspecified', () => {
+    const b = { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 1 };
+    for (const r of DEFAULT_GATEWAY_BACKEND_ROLES) {
+      expect(backendServesRole(b, r)).toBe(true);
+    }
+    expect(backendServesRole(b, 'not-a-role')).toBe(false);
+  });
+
+  it('treats an empty roles array as unspecified (serves all)', () => {
+    const b = { id: 'x', endpoint: 'http://x/v1', model: 'm', maxSlots: 1, roles: [] };
+    expect(backendServesRole(b, 'auto')).toBe(true);
+  });
+});
+
+describe('lintGatewayConfig', () => {
+  it('returns no warnings for a clean config', () => {
+    expect(lintGatewayConfig(makeValidConfig())).toEqual([]);
+  });
+
+  it('warns when a role name collides with another backend id', () => {
+    const c = makeValidConfig();
+    c.backends = [
+      { id: 'auto', endpoint: 'http://x/v1', model: 'm', maxSlots: 1 },
+      { id: 'gpu-b', endpoint: 'http://y/v1', model: 'm', maxSlots: 1, roles: ['auto'] },
+    ];
+    const warns = lintGatewayConfig(c);
+    expect(warns.some(w => /auto/.test(w) && /collid/i.test(w))).toBe(true);
+  });
+
+  it('warns when a role name collides with a backend model', () => {
+    const c = makeValidConfig();
+    c.backends = [
+      { id: 'gpu-a', endpoint: 'http://x/v1', model: 'fast', maxSlots: 1 },
+      { id: 'gpu-b', endpoint: 'http://y/v1', model: 'm', maxSlots: 1, roles: ['fast'] },
+    ];
+    const warns = lintGatewayConfig(c);
+    expect(warns.some(w => /fast/.test(w) && /collid/i.test(w))).toBe(true);
+  });
 });

 describe('validateGatewayConfig', () => {
--- a/src/gateway/config.ts
+++ b/src/gateway/config.ts
@ -18,14 +18,45 @@ export interface GatewayBackendConfig {
  id: string;
  /** Upstream base URL — `/v1` suffix required so /chat/completions resolves cleanly. */
  endpoint: string;
-  /** Model name this backend serves. The gateway routes by strict equality on `request.model`. */
+  /** Model name this backend serves. Used as the backward-compat routing key (strict `request.model` equality). */
  model: string;
  /** Concurrent slots the upstream can serve. Mirrors llama-server's `-np`. */
  maxSlots: number;
+  /**
+   * Performance tiers (Roles) this backend serves: any of `auto`/`fast`/
+   * `quality`/`reflection`. A worker sends the job's required role as the
+   * routing key; the gateway routes role → least-busy backend declaring it.
+   * This lets a heterogeneous GPU pool (different model names per box) be
+   * grouped by tier rather than by exact model name.
+   *
+   * Undefined / empty = serves every default execution role
+   * (DEFAULT_GATEWAY_BACKEND_ROLES) — keeps a pre-roles single-tier config
+   * working unchanged after upgrade.
+   */
+  roles?: string[];
  /** Optional bearer to forward to the backend (Ollama is usually unauthed). */
  apiKey?: string;
 }

+/**
+ * Roles a `roles`-less backend is assumed to serve. Covers every routing
+ * key a worker actually emits (job roles auto/fast/quality/reflection;
+ * title is mapped to `auto` worker-side). Tier separation is opt-in: it
+ * only takes effect once a backend declares an explicit `roles` subset.
+ */
+export const DEFAULT_GATEWAY_BACKEND_ROLES: readonly string[] = ['auto', 'fast', 'quality', 'reflection'];
+
+/**
+ * Does `backend` serve `role`? A backend with no explicit `roles` serves
+ * the full DEFAULT_GATEWAY_BACKEND_ROLES set (migration safety). The
+ * router uses this for role-match candidate selection; falls back to
+ * strict id/model matching only when NO backend serves the key.
+ */
+export function backendServesRole(backend: GatewayBackendConfig, role: string): boolean {
+  const roles = backend.roles && backend.roles.length > 0 ? backend.roles : DEFAULT_GATEWAY_BACKEND_ROLES;
+  return roles.includes(role);
+}
+
 export interface GatewayVirtualKey {
  /** Bearer string a client sends in `Authorization: Bearer <key>`. */
  key: string;
@ -190,11 +221,22 @@ function normalizeBackend(b: GatewayBackendConfig): GatewayBackendConfig {
    typeof ms === 'number' && Number.isFinite(ms) && ms > 0
      ? Math.floor(ms)
      : NaN;
+  // Roles: keep undefined (= serves all) when absent; else trim, dedupe and
+  // drop empty / non-string entries. NOTE: a list whose entries are ALL
+  // dropped becomes [], which backendServesRole treats as "serves all".
+  let roles: string[] | undefined;
+  if (Array.isArray(b.roles)) {
+    const cleaned = b.roles
+      .filter((r): r is string => typeof r === 'string' && r.trim().length > 0)
+      .map(r => r.trim());
+    roles = [...new Set(cleaned)];
+  }
  return {
    id: b.id,
    endpoint: b.endpoint,
    model: b.model,
    maxSlots,
+    ...(roles !== undefined ? { roles } : {}),
    apiKey: b.apiKey,
  };
 }
@ -281,6 +323,11 @@ export function validateGatewayConfig(cfg: GatewayConfig): string[] {
          `gateway.backends[${b.id}].max_slots must be a positive integer (got ${b.maxSlots})`,
        );
      }
+      if (b.roles !== undefined) {
+        if (!Array.isArray(b.roles) || b.roles.some(r => typeof r !== 'string' || r.trim() === '')) {
+          errors.push(`gateway.backends[${b.id}].roles must be an array of non-empty strings if defined`);
+        }
+      }
    }
  }

@ -351,6 +398,48 @@ export function validateGatewayConfig(cfg: GatewayConfig): string[] {
  return errors;
 }

+/**
+ * Non-fatal lint pass — returns human-readable WARNINGS (empty = clean).
+ * Separate from validateGatewayConfig (which returns boot-fatal errors)
+ * because these conditions still route, just ambiguously.
+ *
+ * Currently flags role-name ↔ backend-id/model collisions: because the
+ * router resolves a routing key by role membership FIRST and only falls
+ * back to id/model exact match, a backend whose id or model equals a role
+ * served anywhere in the pool cannot be targeted by that literal name. The
+ * operator probably didn't intend the shadowing — warn, don't block.
+ */
+export function lintGatewayConfig(cfg: GatewayConfig): string[] {
+  const warnings: string[] = [];
+  if (!Array.isArray(cfg.backends)) return warnings;
+
+  // Every role any backend declares (explicit or default-implied).
+  const declaredRoles = new Set<string>();
+  for (const b of cfg.backends) {
+    const roles = b.roles && b.roles.length > 0 ? b.roles : DEFAULT_GATEWAY_BACKEND_ROLES;
+    for (const r of roles) declaredRoles.add(r);
+  }
+
+  for (const b of cfg.backends) {
+    // Compare this backend's literal id/model against every role the pool
+    // serves (explicit or default-implied, see declaredRoles above). A match
+    // means role routing captures that key before the id/model fallback runs.
+    if (declaredRoles.has(b.id)) {
+      warnings.push(
+        `gateway.backends[${b.id}]: backend id '${b.id}' collides with a role name served by the pool — ` +
+          `role routing shadows it, so '${b.id}' is unreachable as a literal model name`,
+      );
+    }
+    if (b.model && b.model !== b.id && declaredRoles.has(b.model)) {
+      warnings.push(
+        `gateway.backends[${b.id}]: model '${b.model}' collides with a role name served by the pool — ` +
+          `role routing shadows it, so '${b.model}' is unreachable as a literal model name`,
+      );
+    }
+  }
+  return warnings;
+}
+
 /**
 * Pull the (possibly missing) gateway block off AppConfig. Used by
 * gateway/bootstrap.ts so both worker and gateway modes can call
--- a/src/gateway/models-endpoint.test.ts
+++ b/src/gateway/models-endpoint.test.ts
@ -11,9 +11,13 @@ describe('buildModelsResponse', () => {
    expect(resp.data).toEqual([]);
  });

-  it('maps each backend id to a model entry', () => {
+  it('maps each backend id to a model entry (id-shaped, not role-shaped)', () => {
+    // /v1/models stays backend-id-shaped: the Settings "physical backends"
+    // panel (config-api /api/workers/:id/backends) reads it to list the GPUs
+    // behind a proxy worker. Role-based routing lives in the router + worker,
+    // not in this discovery endpoint (2026-06-09 decision).
    const resp = buildModelsResponse([
-      { id: 'gpu-a', endpoint: 'http://a/v1', model: 'qwen3:8b', maxSlots: 4 },
+      { id: 'gpu-a', endpoint: 'http://a/v1', model: 'qwen3:8b', maxSlots: 4, roles: ['quality'] },
      { id: 'gpu-b', endpoint: 'http://b/v1', model: 'qwen3:14b', maxSlots: 4 },
    ]);
    expect(resp.data).toEqual([
--- a/src/gateway/models-endpoint.ts
+++ b/src/gateway/models-endpoint.ts
@ -4,7 +4,15 @@
 * Returns one entry per `gateway.backends[]` with `id` set to the backend
 * id (which is also what `x-aao-backend-id` and `x-litellm-model-id` will
 * carry in responses). Clients hit this endpoint to discover which model
- * names they may request.
+ * names they may request, and `/api/workers/:id/backends` (the Settings
+ * "physical backends" panel) reads it to list the GPUs behind a proxy
+ * worker — so it stays backend-id-shaped, NOT role-shaped.
+ *
+ * Role-based routing (2026-06-09) lives entirely in the router + the worker
+ * (which sends a role as the routing key). The over-concentration bug was
+ * fixed worker-side by dropping model auto-select in gateway mode, so this
+ * endpoint did not need to change. A client may still request a role: the
+ * router's role-match arm handles it even though roles aren't advertised here.
 *
 * Auth required: the response surfaces backend ids which could otherwise
 * be enumerated by an unauthenticated probe.
--- a/src/gateway/router.test.ts
+++ b/src/gateway/router.test.ts
@ -41,6 +41,10 @@ function bk(id: string, model: string, maxSlots = 4): GatewayBackendConfig {
  return { id, endpoint: `http://${id}/v1`, model, maxSlots };
 }

+function bkRoles(id: string, model: string, roles: string[], maxSlots = 4): GatewayBackendConfig {
+  return { id, endpoint: `http://${id}/v1`, model, maxSlots, roles };
+}
+
 describe('createRouter.pick', () => {
  it('returns null when model has no configured backend', () => {
    const r = createRouter({
@ -195,6 +199,95 @@ describe('createRouter.pick', () => {
  });
 });

+describe('createRouter.pick — role-based routing (2026-06-09)', () => {
+  it('routes a role key only to backends that declare the role', () => {
+    // Heterogeneous pool: two GPUs serve the quality tier with DIFFERENT
+    // model names; one serves fast. A request for "quality" must never
+    // land on the fast-only backend even though it is the most idle.
+    const r = createRouter({
+      getBackends: () => [
+        bkRoles('q1', 'big-model-a.gguf', ['quality']),
+        bkRoles('q2', 'big-model-b.gguf', ['quality']),
+        bkRoles('f1', 'small-model.gguf', ['fast']),
+      ],
+      registry: makeRegistry([
+        status('q1', true, 3, 4), // ratio 0.75
+        status('q2', true, 1, 4), // ratio 0.25 (winner within the role)
+        status('f1', true, 0, 4), // idle but WRONG role
+      ]),
+    });
+    expect(r.pick('quality')?.id).toBe('q2');
+  });
+
+  it('returns null (503) when every backend serving the role is saturated — no spill to other tiers', () => {
+    const r = createRouter({
+      getBackends: () => [
+        bkRoles('q1', 'a.gguf', ['quality'], 2),
+        bkRoles('f1', 'b.gguf', ['fast'], 2),
+      ],
+      registry: makeRegistry([
+        status('q1', true, 2, 2), // quality full
+        status('f1', true, 0, 2), // fast idle — must NOT be picked for quality
+      ]),
+    });
+    expect(r.pick('quality')).toBeNull();
+  });
+
+  it('enforces per-id max_slots independently within a role', () => {
+    const inflight = createBackendInflightCounter();
+    const r = createRouter({
+      getBackends: () => [
+        bkRoles('q1', 'a.gguf', ['quality'], 1),
+        bkRoles('q2', 'b.gguf', ['quality'], 1),
+      ],
+      registry: makeRegistry([status('q1', true, 0, 1), status('q2', true, 0, 1)]),
+      inflight,
+    });
+    const p1 = r.pick('quality')!.id;
+    const p2 = r.pick('quality')!.id;
+    expect(new Set([p1, p2])).toEqual(new Set(['q1', 'q2'])); // one each
+    expect(r.pick('quality')).toBeNull(); // both reserved
+  });
+
+  it('treats a backend with unspecified roles as serving every role', () => {
+    const r = createRouter({
+      getBackends: () => [bk('generalist', 'm.gguf')], // no roles => all
+      registry: makeRegistry([status('generalist', true, 0, 4)]),
+    });
+    expect(r.pick('auto')?.id).toBe('generalist');
+    expect(r.pick('fast')?.id).toBe('generalist');
+    expect(r.pick('quality')?.id).toBe('generalist');
+  });
+
+  it('falls back to id/model exact match when NO backend declares the role (backward compat)', () => {
+    // A legacy caller sends a literal model name. No backend declares it
+    // as a role, so the router falls back to strict id/model matching.
+    const r = createRouter({
+      getBackends: () => [bkRoles('q1', 'legacy-model.gguf', ['quality'])],
+      registry: makeRegistry([status('q1', true, 0, 4)]),
+    });
+    expect(r.pick('legacy-model.gguf')?.id).toBe('q1'); // model-name fallback
+    expect(r.pick('q1')?.id).toBe('q1'); // id fallback
+  });
+
+  it('prefers role match over an id/model collision (role takes precedence)', () => {
+    // 'quality' is BOTH a declared role on roleBackend AND the id of a
+    // different backend. Role membership wins per decision 3.
+    const r = createRouter({
+      getBackends: () => [
+        bkRoles('roleBackend', 'm-a.gguf', ['quality']),
+        bkRoles('quality', 'm-b.gguf', ['fast']), // id literally "quality" but serves fast — the trap
+      ],
+      registry: makeRegistry([
+        status('roleBackend', true, 0, 4),
+        status('quality', true, 0, 4),
+      ]),
+    });
+    // Only the role-declaring backend is eligible; the id-collision one is ignored.
+    for (let i = 0; i < 5; i++) expect(r.pick('quality')?.id).toBe('roleBackend');
+  });
+});
+
 describe('createRouter.pick — live reservation (per-host max_slots enforcement)', () => {
  it('reserves a slot on pick so a maxSlots=1 backend saturates after one pick', () => {
    // Root-cause regression: before the fix, pick() judged saturation
--- a/src/gateway/router.ts
+++ b/src/gateway/router.ts
@ -2,16 +2,19 @@
 * Least-busy backend router.
 *
 * Selection algorithm:
- *  1. Filter to backends whose configured `id` OR `model` strictly
- *     matches `request.model`. Accepting both means a client can
- *     either name a specific deployment (`b.id`, mirrors LiteLLM's
- *     deployment-name convention used in our /v1/models + /health
- *     responses) or fall back to the generic model alias (`b.model`,
- *     for tools that pass through the raw model name configured on
- *     the worker side). Without the `b.id` arm, the gateway exposed
- *     ids via /v1/models that the worker would faithfully echo back
- *     in the chat request, and routing would 503 because nothing
- *     matched on b.model. Discovered during 2026-05-20 dogfooding.
+ *  1. Pick the candidate pool by ROLE first (2026-06-09). The routing
+ *     key (`request.model`) is normally a performance tier a worker
+ *     sends — auto/fast/quality/reflection. Backends declare which
+ *     tiers they serve via `backends[].roles`; a backend with no
+ *     `roles` serves them all. This groups a heterogeneous GPU pool
+ *     (different model names per box) by tier rather than by exact
+ *     model name. Membership is by config, not live load, so a fully
+ *     saturated tier returns 503 instead of spilling to another tier.
+ *     Only when NO backend serves the key do we fall back to the
+ *     legacy strict `id` OR `model` match — keeping pre-roles configs
+ *     and raw OpenAI clients (which name a deployment id/model alias,
+ *     mirrored in /v1/models + /health) working. The id arm fixed a
+ *     503 found during 2026-05-20 dogfooding.
 *  2. Drop backends the registry says are offline or saturated
 *     (`busy_slots >= total_slots`).
 *  3. Sort the survivors by `busy_slots / total_slots` ascending —
@ -27,7 +30,7 @@
 * predictable in logs. The bias is at most O(slots) over the lifetime
 * of one process, which is fine for a single binary.
 */
-import type { GatewayBackendConfig } from './config.js';
+import { backendServesRole, type GatewayBackendConfig } from './config.js';
 import type { BackendStatusRegistry, NodeStatus } from '../engine/backend-status-registry.js';

 export interface RouterSnapshotBackend {
@ -122,7 +125,24 @@ export function createRouter(deps: CreateRouterDeps): Router {
      deps.registry.noteSubscriberActivity?.();

      if (typeof model !== 'string' || model.length === 0) return null;
-      const backends = deps.getBackends();
+      const allBackends = deps.getBackends();
+      // Candidate pool selection (role-based, dual-match — 2026-06-09).
+      //
+      // The routing key (`model`) is normally a performance tier/role the
+      // worker sends (auto/fast/quality/reflection). Route to backends that
+      // DECLARE that role first; this groups a heterogeneous GPU pool by
+      // tier instead of by exact model name. Role membership is by config
+      // (backendServesRole), NOT by live availability — so if any backend
+      // serves the role, we stay inside that tier even when all of them are
+      // saturated (→ 503), rather than leaking the request to another tier.
+      //
+      // Only when NO backend serves the key do we fall back to the legacy
+      // strict id/model match. That keeps pre-roles configs and non-maestro
+      // OpenAI clients (which send a literal model name) working unchanged.
+      const roleMatched = allBackends.filter(b => backendServesRole(b, model));
+      const backends = roleMatched.length > 0
+        ? roleMatched
+        : allBackends.filter(b => b.id === model || b.model === model);
      // One getAll() per pick, not one per backend — the previous
      // `lookupStatus(b.id)` inside the loop did `registry.getAll()`
      // (which slices its internal cache) once per candidate. With N
@ -137,11 +157,9 @@ export function createRouter(deps: CreateRouterDeps): Router {
      const warm: RankedCandidate[] = [];
      const cold: RankedCandidate[] = [];
      for (const b of backends) {
-        // Match either the deployment id (LiteLLM-style routing key
-        // mirrored in /v1/models + /health) or the model alias the
-        // backend serves. See top-of-file docstring for why both arms
-        // are needed.
-        if (b.id !== model && b.model !== model) continue;
+        // `backends` is already filtered to the chosen candidate pool
+        // (role match, else id/model fallback) — no per-backend match
+        // check needed here.
        // Live reservation count for this backend (0 when no inflight
        // counter is wired). Used both to gate admission and to rank.
        const reserved = deps.inflight?.get(b.id) ?? 0;
--- a/src/llm/routing-key.test.ts
+++ b/src/llm/routing-key.test.ts
@ -0,0 +1,124 @@
+/**
+ * Routing-key selection tests.
+ *
+ * The single decision shared by every LLM call site: gateway (proxy) mode
+ * routes by Role; direct mode sends the resolved model name. See
+ * docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md.
+ */
+import { describe, it, expect, vi } from 'vitest';
+import { llmRoutingKey, shouldRequeueForModelMismatch } from './routing-key.js';
+
+describe('llmRoutingKey', () => {
+  it('gateway mode sends the role as the key', () => {
+    const key = llmRoutingKey({
+      isGateway: true,
+      role: 'quality',
+      resolveDirectModel: () => 'should-not-be-used.gguf',
+    });
+    expect(key).toBe('quality');
+  });
+
+  it('gateway mode never evaluates the direct-model resolver (no /v1/models, no auto-select)', () => {
+    const resolveDirectModel = vi.fn(() => 'm');
+    llmRoutingKey({ isGateway: true, role: 'fast', resolveDirectModel });
+    expect(resolveDirectModel).not.toHaveBeenCalled();
+  });
+
+  it('gateway mode falls back to "auto" when role is undefined', () => {
+    expect(
+      llmRoutingKey({ isGateway: true, role: undefined, resolveDirectModel: () => 'm' }),
+    ).toBe('auto');
+  });
+
+  it('gateway mode honours a custom role fallback (e.g. reflection)', () => {
+    expect(
+      llmRoutingKey({ isGateway: true, role: undefined, resolveDirectModel: () => 'm', roleFallback: 'reflection' }),
+    ).toBe('reflection');
+  });
+
+  it('direct mode returns the resolved model and ignores the role', () => {
+    const key = llmRoutingKey({
+      isGateway: false,
+      role: 'quality',
+      resolveDirectModel: () => 'qwen3:8b',
+    });
+    expect(key).toBe('qwen3:8b');
+  });
+
+  it('direct mode preserves an undefined model (llama-server model omission)', () => {
+    expect(
+      llmRoutingKey({ isGateway: false, role: 'auto', resolveDirectModel: () => undefined }),
+    ).toBeUndefined();
+  });
+});
+
+describe('shouldRequeueForModelMismatch', () => {
+  it('NEVER requeues in gateway mode — availableModels holds roles, gateway owns model selection', () => {
+    // The regression: in gateway mode availableModels = advertised roles
+    // (auto/fast/quality), so a piece pinning a concrete model would never
+    // match and the job would requeue forever.
+    expect(
+      shouldRequeueForModelMismatch({
+        isGateway: true,
+        pieceModel: 'qwen3:32b',
+        availableModels: new Set(['auto', 'fast', 'quality']),
+        workerModel: '',
+      }),
+    ).toBe(false);
+  });
+
+  it('direct mode requeues when the piece pins an unavailable model', () => {
+    expect(
+      shouldRequeueForModelMismatch({
+        isGateway: false,
+        pieceModel: 'qwen3:32b',
+        availableModels: new Set(['qwen3:8b']),
+        workerModel: 'qwen3:8b',
+      }),
+    ).toBe(true);
+  });
+
+  it('direct mode does NOT requeue when the piece model is available', () => {
+    expect(
+      shouldRequeueForModelMismatch({
+        isGateway: false,
+        pieceModel: 'qwen3:8b',
+        availableModels: new Set(['qwen3:8b', 'qwen3:32b']),
+        workerModel: 'qwen3:8b',
+      }),
+    ).toBe(false);
+  });
+
+  it('direct mode does NOT requeue when the piece model equals the worker model (probe empty)', () => {
+    expect(
+      shouldRequeueForModelMismatch({
+        isGateway: false,
+        pieceModel: 'qwen3:8b',
+        availableModels: new Set(['other']),
+        workerModel: 'qwen3:8b',
+      }),
+    ).toBe(false);
+  });
+
+  it('direct mode does NOT requeue when the piece pins no model', () => {
+    expect(
+      shouldRequeueForModelMismatch({
+        isGateway: false,
+        pieceModel: undefined,
+        availableModels: new Set(['qwen3:8b']),
+        workerModel: 'qwen3:8b',
+      }),
+    ).toBe(false);
+  });
+
+  it('direct mode does NOT requeue before the model list is probed (size 0)', () => {
+    expect(
+      shouldRequeueForModelMismatch({
+        isGateway: false,
+        pieceModel: 'qwen3:32b',
+        availableModels: new Set(),
+        workerModel: '',
+      }),
+    ).toBe(false);
+  });
+});
--- a/src/llm/routing-key.ts
+++ b/src/llm/routing-key.ts
@ -0,0 +1,61 @@
+/**
+ * The single decision shared by every LLM call site (job execution, ASK
+ * answer, title generation, reflection): what to put in `request.model`.
+ *
+ *  - Gateway (proxy) mode routes by ROLE. The AAO Gateway resolves the role
+ *    (auto/fast/quality/reflection) to a least-busy backend declaring it, so
+ *    a heterogeneous GPU pool stays grouped by performance tier instead of by
+ *    exact model name. No /v1/models discovery, no model auto-select — those
+ *    collapsed every tier onto backends[0] (the 2026-06-09 over-concentration
+ *    bug).
+ *  - Direct mode sends the resolved model name (piece override, configured
+ *    model, or omission for llama-server) exactly as before.
+ *
+ * The direct-model side is a thunk so gateway mode never runs the (logging,
+ * auto-select) resolution path. See
+ * docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md.
+ */
+export function llmRoutingKey(params: {
+  /** True when the worker connects through the AAO Gateway (connection_type: aao_gateway → proxy). */
+  isGateway: boolean;
+  /** The job's required tier (or the worker's serving tier for title generation). */
+  role: string | undefined;
+  /** Lazily resolves the direct-mode model. NOT called in gateway mode. */
+  resolveDirectModel: () => string | undefined;
+  /** Role key when `role` is undefined (an empty string is sent as-is). Defaults to 'auto'. */
+  roleFallback?: string;
+}): string | undefined {
+  if (params.isGateway) {
+    return params.role ?? params.roleFallback ?? 'auto';
+  }
+  return params.resolveDirectModel();
+}
+
+/**
+ * Should the worker requeue a job because the piece pins a model this worker
+ * cannot serve?
+ *
+ * Direct mode: requeue when the piece names a concrete model that is neither
+ * in the probed `availableModels` set nor the worker's own configured model
+ * — another worker may serve it.
+ *
+ * Gateway mode: ALWAYS false. The worker routes by role and `availableModels`
+ * holds the gateway's advertised role set (not model names), so a piece that
+ * pins a real model would never match and the job would requeue forever. The
+ * gateway owns model selection there. See
+ * docs/superpowers/plans/2026-06-09-role-based-gateway-routing.md.
+ */
+export function shouldRequeueForModelMismatch(params: {
+  isGateway: boolean;
+  pieceModel: string | undefined;
+  availableModels: Set<string>;
+  workerModel: string | undefined;
+}): boolean {
+  if (params.isGateway) return false;
+  return (
+    !!params.pieceModel &&
+    params.availableModels.size > 0 &&
+    !params.availableModels.has(params.pieceModel) &&
+    params.workerModel !== params.pieceModel
+  );
+}
--- a/src/worker-bootstrap.ts
+++ b/src/worker-bootstrap.ts
@ -19,6 +19,7 @@ import { logger } from './logger.js';
 import { accessSync, existsSync, mkdirSync, constants } from 'fs';
 import { dirname, resolve, join } from 'path';
 import { OpenAICompatClient } from './llm/openai-compat.js';
+import { llmRoutingKey } from './llm/routing-key.js';
 import { ConfigManager } from './config-manager.js';
 import { WorkerManager } from './worker-manager.js';
 import { classifyPiece } from './engine/piece-classifier.js';
@ -127,10 +128,23 @@ export async function start(opts: StartWorkerOptions = {}): Promise<void> {

  if (titleWorker) {
    const titleModel = titleWorker.model ?? config.provider.model;
-    logger.info(`Config: title generation worker=${titleWorker.id} model=${titleModel ?? '<none>'}`);
+    // Gateway (proxy) mode routes by role, not model name. Title generation
+    // is cheap, so send the first entry of the title worker's `roles` that is
+    // auto/fast/quality, else 'auto'. Direct mode sends the configured model.
+    const titleTier =
+      titleWorker.roles?.find(r => r === 'auto' || r === 'fast' || r === 'quality') ?? 'auto';
+    const titleRoutingKey = llmRoutingKey({
+      isGateway: titleWorker.proxy === true,
+      role: titleTier,
+      resolveDirectModel: () => titleModel,
+    });
+    logger.info(
+      `Config: title generation worker=${titleWorker.id} key=${titleRoutingKey ?? '<none>'}` +
+        (titleWorker.proxy === true ? ' (gateway:role)' : ''),
+    );
    titleClient = new OpenAICompatClient(
      titleWorker.endpoint,
-      titleModel,
+      titleRoutingKey,
      undefined,
      config.provider.retry,
      (config.provider.timeoutMinutes ?? 10) * 60 * 1000,
--- a/src/worker.ts
+++ b/src/worker.ts
@ -4,6 +4,7 @@ import { BrowserSessionRepo } from './db/browser-session-repo.js';
 import { assertProfileOwner } from './engine/browser-session-auth.js';
 import { initMasterKey, decryptUserDek, decryptStateBlob } from './crypto/sessions.js';
 import { OpenAICompatClient } from './llm/openai-compat.js';
+import { llmRoutingKey, shouldRequeueForModelMismatch } from './llm/routing-key.js';
 import { loadPiece, runPiece, PieceRunCallbacks, PieceDef, type PieceRunResult } from './engine/piece-runner.js';
 import { LocalProgressReporter } from './progress/local-reporter.js';
 import { buildLocalConversationContext } from './engine/local-context.js';
@ -697,9 +698,14 @@ export class Worker {
    const parentJob = await this.repo.getJob(parentJobId);
    const parentInstruction = parentJob?.instruction ?? '(不明)';

-    const resolvedModel = this.model;
-    const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
    const workerDefForAnswer = this.getWorkerDef();
+    // Gateway routes by the subtask's tier; direct keeps the worker's model.
+    const resolvedModel = llmRoutingKey({
+      isGateway: workerDefForAnswer.proxy === true,
+      role: subtaskJob.requiredRole,
+      resolveDirectModel: () => this.model,
+    });
+    const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
    const llmClient = new OpenAICompatClient(
      this.endpoint,
      resolvedModel,
@ -954,11 +960,15 @@ export class Worker {
      ].filter((d): d is string => !!d);
      logger.info(`[worker:${this.workerId}] job ${jobId} loadPiece piece=${job.pieceName} customDirs=[${customPieceDirs.join(', ') || 'none'}] piecesDir=pieces`);
      const piece = loadPiece(job.pieceName, 'pieces', customPieceDirs);
+      // Model-mismatch requeue gate (direct mode only — gateway routes by
+      // role, see shouldRequeueForModelMismatch).
      if (
-        piece.model &&
-        this.availableModels.size > 0 &&
-        !this.availableModels.has(piece.model) &&
-        this.model !== piece.model
+        shouldRequeueForModelMismatch({
+          isGateway: this.getWorkerDef().proxy === true,
+          pieceModel: piece.model,
+          availableModels: this.availableModels,
+          workerModel: this.model,
+        })
      ) {
        await this.repo.updateJob(jobId, {
          status: 'queued',
@ -996,11 +1006,16 @@ export class Worker {
        return;
      }

-      // Piece のモデル指定を解決
-      const resolvedModel = this.resolveModel(piece);
-      const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
      const workerDefForLlm = this.getWorkerDef();
      const isProxyWorker = workerDefForLlm.proxy === true;
+      // Gateway routes by role; direct resolves the worker's model. The
+      // resolver thunk runs only in direct mode (no auto-select via gateway).
+      const resolvedModel = llmRoutingKey({
+        isGateway: isProxyWorker,
+        role: job.requiredRole,
+        resolveDirectModel: () => this.resolveModel(piece),
+      });
+      const timeoutMs = (this.config.provider.timeoutMinutes ?? 10) * 60 * 1000;
      const llmClient = new OpenAICompatClient(
        this.endpoint,
        resolvedModel,
@ -1652,12 +1667,20 @@ export class Worker {
  private async handleReflectionJob(job: Job): Promise<void> {
    const { runReflectionJob } = await import('./engine/reflection/reflection-runner.js');
    try {
+      // Gateway mode routes by role: send the reflection tier as the key
+      // (job.requiredRole is 'reflection'), not the worker's model name.
+      const reflectionRoutingKey = llmRoutingKey({
+        isGateway: this.getWorkerDef().proxy === true,
+        role: job.requiredRole,
+        resolveDirectModel: () => this.model,
+        roleFallback: 'reflection',
+      });
      const outcome = await runReflectionJob(
        {
          repo: this.repo,
          config: this.config,
          llmEndpoint: this.endpoint,
-          llmModel: this.model,
+          llmModel: reflectionRoutingKey,
        },
        job
      );
--- a/ui/src/components/settings/GatewayServerForm.tsx
+++ b/ui/src/components/settings/GatewayServerForm.tsx
@ -38,9 +38,28 @@ interface GatewayBackend {
  endpoint?: string;
  model?: string;
  maxSlots?: number;
+  /** Performance tiers (Roles) this backend serves. Empty = serves all. */
+  roles?: string[];
  apiKey?: string;
 }

+/** Render a roles array as the ", "-joined text shown in the roles input; a missing (non-array) value becomes ''. */
+function rolesToInput(roles: string[] | undefined): string {
+  return Array.isArray(roles) ? roles.join(', ') : '';
+}
+
+/**
+ * Parse the comma-separated roles input: each entry is trimmed, empty entries are dropped, and duplicates are removed (first occurrence kept).
+ * Returns `undefined` when nothing remains (= backend serves every role, migration-safe).
+ */
+function parseRolesInput(v: string): string[] | undefined {
+  const parts = v
+    .split(',')
+    .map(s => s.trim())
+    .filter(s => s.length > 0); // NOTE(review): also drops a trailing ',' — confirm the input can still be typed role-by-role if it re-renders from the parsed array.
+  return parts.length > 0 ? [...new Set(parts)] : undefined;
+}
+
 interface GatewayConfigShape {
  enabled?: boolean;
  listenPort?: number;
@ -246,7 +265,7 @@ export function GatewayServerForm({ config, onChange }: SectionFormProps) {
          </button>
        </div>
        <HelpText>
-          ルーティング先の llama-server / Ollama / vLLM など。Gateway は <code>request.model</code> に一致する <code>model</code> を持つ最も busy ではない backend に割り振ります。<br/>
+          ルーティング先の llama-server / Ollama / vLLM など。Gateway は worker が送る <strong>role</strong> を担う backend のうち最も空いているものに割り振ります (<code>roles</code> 未設定の backend は全 role 対応)。role を担う backend が無い場合のみ <code>request.model</code> = <code>id</code>/<code>model</code> の厳密一致にフォールバックします。<br/>
          <strong>api_key の保存形式</strong>: フォームで入力した値は <code>config.yaml</code> に平文で保存されます。<code>${'${VAR}'}</code> 形式の env var 参照はフォーム保存時に literal 文字列として保存されるため、env 経由で渡したい場合は <code>config.yaml</code> を直接編集してください。
        </HelpText>
        {backends.length === 0 ? (
@ -291,6 +310,17 @@ export function GatewayServerForm({ config, onChange }: SectionFormProps) {
                        placeholder="1"
                      />
                    </div>
+                    <div className="col-span-2">
+                      <FieldLabel>roles (任意)</FieldLabel>
+                      <FieldInput
+                        value={rolesToInput(b.roles)}
+                        onChange={v => updateBackend(i, 'roles', parseRolesInput(v))}
+                        placeholder="quality, auto (空欄=全ロール)"
+                      />
+                      <HelpText>
+                        この backend が担う性能ティア (<code>auto</code> / <code>fast</code> / <code>quality</code> / <code>reflection</code>) をカンマ区切りで。worker はジョブの role を routing key として送り、Gateway がその role を担う最も空いている backend に振ります。<strong>空欄なら全ロール</strong>を担当 (従来どおり)。異なる model 名の GPU でも同じ role でまとめられます。
+                      </HelpText>
+                    </div>
                    <div>
                      <FieldLabel>api_key (任意)</FieldLabel>
                      <FieldInput