sync: update from private repo (402599f)
Some checks failed
CI / build-and-test (push) Has been cancelled
Some checks failed
CI / build-and-test (push) Has been cancelled
This commit is contained in:
parent
57685d995c
commit
c526adddc2
30
.dockerignore
Normal file
30
.dockerignore
Normal file
@ -0,0 +1,30 @@
|
||||
.git
|
||||
.gitea
|
||||
.claude
|
||||
.code-review-graph
|
||||
.context
|
||||
.gstack
|
||||
.superpowers
|
||||
.worktrees
|
||||
|
||||
node_modules
|
||||
ui/node_modules
|
||||
dist
|
||||
ui/dist
|
||||
vendor
|
||||
|
||||
.env
|
||||
config.yaml
|
||||
data
|
||||
input
|
||||
logs
|
||||
output
|
||||
*.db
|
||||
*.db-shm
|
||||
*.db-wal
|
||||
core
|
||||
core.*
|
||||
|
||||
docs/plans
|
||||
docs/superpowers
|
||||
oss
|
||||
@ -1,3 +1,6 @@
|
||||
GITEA_API_TOKEN=your-gitea-token
|
||||
GITEA_WEBHOOK_SECRET=your-webhook-secret
|
||||
OLLAMA_BASE_URL=http://localhost:11434/v1
|
||||
# Docker Compose reaches an LLM running on the host through this address.
|
||||
OLLAMA_BASE_URL=http://host.docker.internal:11434/v1
|
||||
OLLAMA_MODEL=qwen3:32b
|
||||
|
||||
# Required only when enabling MCP or SSH.
|
||||
# MCP_ENCRYPTION_KEY=replace-with-64-hex-characters
|
||||
|
||||
26
.gitea/workflows/ci.yml
Normal file
26
.gitea/workflows/ci.yml
Normal file
@ -0,0 +1,26 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
build-and-test:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 22
|
||||
cache: npm
|
||||
cache-dependency-path: |
|
||||
package-lock.json
|
||||
ui/package-lock.json
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
npm ci
|
||||
npm --prefix ui ci
|
||||
- name: Build
|
||||
run: npm run build:all -- --skip-python
|
||||
- name: Test
|
||||
run: npm test
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,4 +1,6 @@
|
||||
node_modules/
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
dist/
|
||||
data/
|
||||
.env
|
||||
|
||||
28
Dockerfile
28
Dockerfile
@ -2,7 +2,6 @@ FROM node:22-alpine AS builder
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 依存関係のインストール
|
||||
COPY package.json package-lock.json* ./
|
||||
COPY ui/package.json ui/package-lock.json* ./ui/
|
||||
RUN npm ci --ignore-scripts
|
||||
@ -19,14 +18,12 @@ RUN apk add --no-cache --virtual .novnc-fetch curl tar \
|
||||
&& test -f /app/vendor/noVNC/vnc.html \
|
||||
&& apk del .novnc-fetch
|
||||
|
||||
# TypeScript ビルド
|
||||
COPY tsconfig.json ./
|
||||
COPY src ./src
|
||||
COPY ui ./ui
|
||||
RUN npm run build:server
|
||||
RUN npm run build:ui
|
||||
|
||||
# --- ランタイムステージ ---
|
||||
FROM node:22-alpine AS runtime
|
||||
|
||||
RUN apk add --no-cache \
|
||||
@ -36,7 +33,8 @@ RUN apk add --no-cache \
|
||||
bash \
|
||||
bubblewrap \
|
||||
python3 \
|
||||
py3-pip
|
||||
py3-pip \
|
||||
&& apk add --no-cache --virtual .native-build-deps build-base
|
||||
|
||||
# Pre-bake python packages into the system site-packages (read-only bind-mounted
|
||||
# into every bash sandbox). Runtime `pip install` is intentionally unsupported.
|
||||
@ -46,22 +44,22 @@ RUN pip3 install --no-cache-dir --break-system-packages -r /tmp/python-requireme
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 本番依存のみインストール
|
||||
COPY package.json package-lock.json* ./
|
||||
RUN npm ci --omit=dev --ignore-scripts
|
||||
RUN npm ci --omit=dev \
|
||||
&& npm cache clean --force \
|
||||
&& apk del .native-build-deps
|
||||
|
||||
# ビルド済み成果物をコピー
|
||||
COPY --from=builder /app/dist ./dist
|
||||
COPY --from=builder /app/ui/dist ./ui/dist
|
||||
COPY --from=builder /app/vendor ./vendor
|
||||
# schema.sql は dist に含まれないため個別コピー
|
||||
COPY src/db/schema.sql ./dist/db/schema.sql
|
||||
COPY pieces ./pieces
|
||||
COPY docs ./docs
|
||||
|
||||
# デフォルト設定
|
||||
COPY config.yaml ./
|
||||
# Ship a runnable default while still allowing a read-only config mount.
|
||||
COPY config.yaml.example ./config.yaml
|
||||
|
||||
# データ永続化ディレクトリ
|
||||
RUN mkdir -p /data /workspaces
|
||||
RUN mkdir -p /data /workspaces \
|
||||
&& chown -R node:node /data /workspaces
|
||||
|
||||
ENV NODE_ENV=production \
|
||||
PORT=9876 \
|
||||
@ -69,4 +67,6 @@ ENV NODE_ENV=production \
|
||||
|
||||
EXPOSE 9876
|
||||
|
||||
CMD ["node", "dist/index.js"]
|
||||
USER node
|
||||
|
||||
CMD ["node", "dist/main.js"]
|
||||
|
||||
@ -21,12 +21,12 @@ OpenAI 互換の LLM エンドポイント([Ollama](https://ollama.com/) / vLL
|
||||
### Docker(最短)
|
||||
|
||||
```bash
|
||||
cp .env.example .env # OLLAMA_BASE_URL などを設定
|
||||
cp .env.example .env # OLLAMA_BASE_URL / OLLAMA_MODEL を設定
|
||||
docker compose up -d
|
||||
# http://localhost:9876 を開く
|
||||
```
|
||||
|
||||
LLM エンドポイントは `.env` / `config.yaml` で指定する(既定は `http://localhost:11434/v1`)。
|
||||
Compose は安全のため `127.0.0.1:9876` のみに公開する。別ホストからアクセス可能にする前に OAuth 認証を設定し、TLS 対応のリバースプロキシを配置すること。LLM エンドポイントは `.env` / `config.yaml` で指定する。
|
||||
|
||||
### ソースから
|
||||
|
||||
@ -55,6 +55,11 @@ scripts/server.sh start # http://localhost:9876
|
||||
- **[docs/tools/](docs/tools/)** — 各ツールの詳細
|
||||
- **[docs/operations/bash-sandbox-provisioning.md](docs/operations/bash-sandbox-provisioning.md)** — 本番でのサンドボックス有効化手順
|
||||
- **[AGENTS.md](AGENTS.md)** / **[CONTRIBUTING.md](CONTRIBUTING.md)** — コントリビュータ向け
|
||||
- **[SECURITY.md](SECURITY.md)** — セキュリティ方針・脆弱性報告
|
||||
|
||||
## セキュリティ
|
||||
|
||||
既定では認証なしで動作するため、信頼できないネットワークへ直接公開しないこと。複数ユーザーまたは外部公開環境では OAuth 認証、`safety.bash_sandbox: always`、TLS リバースプロキシを有効にする。詳細は [SECURITY.md](SECURITY.md) を参照。
|
||||
|
||||
## サーバー管理
|
||||
|
||||
|
||||
26
SECURITY.md
Normal file
26
SECURITY.md
Normal file
@ -0,0 +1,26 @@
|
||||
# Security Policy
|
||||
|
||||
## Supported Versions
|
||||
|
||||
Security fixes are applied to the latest release and the `main` branch.
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
Do not open a public issue for an undisclosed vulnerability. Use the
|
||||
repository host's private security-reporting feature when available, or contact
|
||||
the repository owner privately. Include affected versions, impact, reproduction
|
||||
steps, and any suggested mitigation. Maintainers should acknowledge a report
|
||||
within seven days and coordinate disclosure after a fix is available.
|
||||
|
||||
## Deployment Baseline
|
||||
|
||||
MAESTRO can execute tools, browser actions, and optionally SSH commands. Treat
|
||||
it as a privileged service:
|
||||
|
||||
- Keep the service bound to localhost until OAuth authentication is configured.
|
||||
- Put internet-facing deployments behind a TLS reverse proxy.
|
||||
- Set `safety.bash_sandbox: always` for multi-user deployments.
|
||||
- Keep `MCP_ENCRYPTION_KEY`, OAuth secrets, SSH keys, and provider credentials
|
||||
outside the repository and rotate them after suspected exposure.
|
||||
- Restrict `/metrics` with a bearer token or an explicit source-IP allowlist.
|
||||
- Review enabled tools and integrations before granting access to untrusted users.
|
||||
@ -7,7 +7,10 @@ services:
|
||||
container_name: maestro
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "9876:9876"
|
||||
# Auth is optional, so keep the default deployment local-only.
|
||||
- "127.0.0.1:9876:9876"
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
@ -20,7 +23,7 @@ services:
|
||||
- maestro-data:/data
|
||||
# エージェントワークスペース永続化
|
||||
- maestro-workspaces:/workspaces
|
||||
# 設定ファイル (任意でホストからマウント)
|
||||
# 設定ファイル (任意でホストから read-only マウント)
|
||||
# - ./config.yaml:/app/config.yaml:ro
|
||||
healthcheck:
|
||||
test: ["CMD", "node", "-e", "fetch('http://localhost:9876/health').then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))"]
|
||||
|
||||
@ -56,12 +56,12 @@ scripts/server.sh stop
|
||||
## 5. Docker で起動
|
||||
|
||||
```bash
|
||||
cp .env.example .env # OLLAMA_BASE_URL 等を設定
|
||||
cp .env.example .env # OLLAMA_BASE_URL / OLLAMA_MODEL を設定
|
||||
docker compose up -d
|
||||
# http://localhost:9876
|
||||
```
|
||||
|
||||
DB とワークスペースは named volume(`maestro-data` / `maestro-workspaces`)に永続化される。`config.yaml` をホストからマウントする場合は `docker-compose.yml` のコメントを参照。
|
||||
DB とワークスペースは named volume(`maestro-data` / `maestro-workspaces`)に永続化される。Compose は既定で `127.0.0.1:9876` のみに公開する。`config.yaml` をホストからマウントする場合は `docker-compose.yml` のコメントを参照。
|
||||
|
||||
## 6. 最初のタスク
|
||||
|
||||
@ -75,6 +75,9 @@ DB とワークスペースは named volume(`maestro-data` / `maestro-workspac
|
||||
`auth` セクションを設定する(クライアント ID/シークレット/コールバック URL)。詳細は
|
||||
[configuration.md の auth セクション](configuration.md#auth) を参照。
|
||||
|
||||
認証を有効にするまでは信頼できないネットワークへ公開しないこと。外部公開時は TLS
|
||||
対応のリバースプロキシも使用する。運用上の注意は [../SECURITY.md](../SECURITY.md) を参照。
|
||||
|
||||
## 8. Bash サンドボックスを有効にする(任意・マルチユーザー推奨)
|
||||
|
||||
エージェントの Bash 実行をタスク単位で隔離する。本番では:
|
||||
|
||||
2075
package-lock.json
generated
2075
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
12
package.json
12
package.json
@ -1,9 +1,14 @@
|
||||
{
|
||||
"name": "maestro",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"license": "Apache-2.0",
|
||||
"type": "module",
|
||||
"description": "MAESTRO — Multi-Agent Execution System for Task Routing & Orchestration. A local-first control plane for coordinating AI agents, workers, tools, and gateways.",
|
||||
"main": "dist/main.js",
|
||||
"engines": {
|
||||
"node": ">=22"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "npm run build:server",
|
||||
"build:server": "bash scripts/generate-version.sh && tsc && npm run copy:assets",
|
||||
@ -72,6 +77,11 @@
|
||||
"supertest": "^7.2.2",
|
||||
"tsx": "^4.21.0",
|
||||
"typescript": "^5.4.3",
|
||||
"vitest": "^1.4.0"
|
||||
"vitest": "^4.1.8"
|
||||
},
|
||||
"overrides": {
|
||||
"exceljs": {
|
||||
"uuid": "11.1.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -32,6 +32,17 @@ do_start() {
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Surface stray maestro processes the pidfile doesn't track (e.g. an orphaned
|
||||
# run on another port/mode that a previous restart never stopped). The
|
||||
# app-level worker lock (src/instance-lock.ts) will refuse to double-start
|
||||
# worker mode against the same DB, but warn here so the operator notices.
|
||||
local strays
|
||||
strays=$(pgrep -f "node dist/main.js" 2>/dev/null | grep -vx "$$" || true)
|
||||
if [[ -n "$strays" ]]; then
|
||||
echo "WARNING: other 'node dist/main.js' process(es) detected (PID: $(echo "$strays" | tr '\n' ' '))."
|
||||
echo " If a previous run was not stopped cleanly, stop it first to avoid port/DB conflicts."
|
||||
fi
|
||||
|
||||
mkdir -p "$(dirname "$LOG_FILE")"
|
||||
|
||||
echo "Checking runtime dependencies..."
|
||||
|
||||
@ -587,3 +587,31 @@ describe('reflection config section', () => {
|
||||
expect(cfg.reflection.snapshotRetentionDays).toBe(90);
|
||||
});
|
||||
});
|
||||
|
||||
describe('loadConfig file cache', () => {
|
||||
const dirs: string[] = [];
|
||||
afterEach(() => {
|
||||
for (const d of dirs) rmSync(d, { recursive: true, force: true });
|
||||
dirs.length = 0;
|
||||
});
|
||||
function tmpCfg(content: string): string {
|
||||
const d = mkdtempSync(join(tmpdir(), 'maestro-cfg-'));
|
||||
dirs.push(d);
|
||||
const p = join(d, 'config.yaml');
|
||||
writeFileSync(p, content);
|
||||
return p;
|
||||
}
|
||||
|
||||
it('returns the same value on a repeated unchanged load (cache hit)', () => {
|
||||
const p = tmpCfg('concurrency: 9\n');
|
||||
expect(loadConfig(p).concurrency).toBe(9);
|
||||
expect(loadConfig(p).concurrency).toBe(9);
|
||||
});
|
||||
|
||||
it('reflects a changed file (cache invalidated by mtime/size)', () => {
|
||||
const p = tmpCfg('concurrency: 7\n');
|
||||
expect(loadConfig(p).concurrency).toBe(7);
|
||||
writeFileSync(p, 'concurrency: 11\nmax_iterations: 5\n');
|
||||
expect(loadConfig(p).concurrency).toBe(11);
|
||||
});
|
||||
});
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { readFileSync, existsSync } from 'fs';
|
||||
import { readFileSync, existsSync, statSync } from 'fs';
|
||||
import { parse as parseYaml } from 'yaml';
|
||||
import { logger } from './logger.js';
|
||||
import { normalizeConfig } from './config-normalize.js';
|
||||
@ -648,15 +648,46 @@ function normalizeWorkerDefs(workers: WorkerDef[]): WorkerDef[] {
|
||||
});
|
||||
}
|
||||
|
||||
// loadConfig() is called from many hot paths (agent-loop, worker poll, tool
|
||||
// execution) — historically several times per second. Re-reading + re-parsing
|
||||
// the YAML and emitting an INFO log on every call produced a reload "storm"
|
||||
// (observed: 7572 "Config: loaded" lines/day, multi-100MB server.log). Cache
|
||||
// the parsed file keyed by (path, mtime, size): on an unchanged file we skip
|
||||
// the read/parse and the log entirely. The cheap, pure merge/normalize below
|
||||
// still runs each call, so callers always get a fresh, independent object.
|
||||
let configFileCache:
|
||||
| { path: string; mtimeMs: number; size: number; fileConfig: Partial<AppConfig> }
|
||||
| null = null;
|
||||
|
||||
export function loadConfig(configPath: string = 'config.yaml'): AppConfig {
|
||||
let fileConfig: Partial<AppConfig> = {};
|
||||
let freshLoad = false;
|
||||
|
||||
if (existsSync(configPath)) {
|
||||
try {
|
||||
const st = statSync(configPath);
|
||||
if (
|
||||
configFileCache &&
|
||||
configFileCache.path === configPath &&
|
||||
configFileCache.mtimeMs === st.mtimeMs &&
|
||||
configFileCache.size === st.size
|
||||
) {
|
||||
// Cache hit — file unchanged since last parse. Clone so callers can
|
||||
// never mutate the cached copy (deepMerge shares array refs).
|
||||
fileConfig = structuredClone(configFileCache.fileConfig);
|
||||
} else {
|
||||
const raw = readFileSync(configPath, 'utf-8');
|
||||
const parsed = parseYaml(raw) as unknown;
|
||||
fileConfig = transformKeys(parsed) as Partial<AppConfig>;
|
||||
configFileCache = {
|
||||
path: configPath,
|
||||
mtimeMs: st.mtimeMs,
|
||||
size: st.size,
|
||||
fileConfig: structuredClone(fileConfig),
|
||||
};
|
||||
freshLoad = true;
|
||||
logger.info(`Config: loaded from ${configPath}`);
|
||||
}
|
||||
} catch (err) {
|
||||
logger.warn(`Config: failed to parse ${configPath}: ${err}`);
|
||||
}
|
||||
@ -716,7 +747,11 @@ export function loadConfig(configPath: string = 'config.yaml'): AppConfig {
|
||||
|
||||
config.provider.workers = normalizeWorkerDefs(config.provider.workers);
|
||||
|
||||
// Only log on a fresh (re)load — see configFileCache note above. Logging on
|
||||
// every cache hit was the other half of the reload storm.
|
||||
if (freshLoad) {
|
||||
logger.info(`Config: workers=${config.provider.workers.map(w => w.id).join(',')}`);
|
||||
}
|
||||
|
||||
// v2 normalization (Step 1 of 2026-05-21 Settings UI / Config restructure).
|
||||
//
|
||||
|
||||
@ -574,7 +574,7 @@ describe('executeMovement parallel tool execution', () => {
|
||||
expect(markerMessages.length).toBeGreaterThanOrEqual(1);
|
||||
});
|
||||
|
||||
it('force-transitions to defaultNext when initial prompt is oversized and defaultNext is set', async () => {
|
||||
it('aborts when initial prompt is oversized and defaultNext is terminal', async () => {
|
||||
const { ContextManager } = await import('./context-manager.js');
|
||||
const cm = new ContextManager({ limitTokens: 1_000 });
|
||||
|
||||
@ -594,10 +594,10 @@ describe('executeMovement parallel tool execution', () => {
|
||||
{ contextManager: cm },
|
||||
);
|
||||
|
||||
// makeMovement defaultNext is 'COMPLETE' so we force-transition there
|
||||
expect(result.next).toBe('COMPLETE');
|
||||
expect(result.output).toContain('Context overflow');
|
||||
expect(result.lessons).toContain('Context overflow');
|
||||
// A terminal fallback would report a false completion, so overflow aborts.
|
||||
expect(result.next).toBe('ABORT');
|
||||
expect(result.abortCode).toBe('context_overflow');
|
||||
expect(result.output).toContain('LLM request blocked before send');
|
||||
// Only the isolated summary call should have happened — no main LLM call
|
||||
expect(executeToolMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import { execFileSync } from 'child_process';
|
||||
import { tmpdir } from 'os';
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'fs';
|
||||
import { existsSync as existsSyncEvents, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import type { MovementResult } from './agent-loop.js';
|
||||
import type { PieceDef } from './piece-runner.js';
|
||||
@ -521,15 +521,9 @@ describe('buildFollowupNotice (option C)', () => {
|
||||
// Traceability T-2 — handoff / delta / followup / context_action
|
||||
// ============================================================
|
||||
|
||||
import { runPiece } from './piece-runner.js';
|
||||
import { readFileSync } from 'fs';
|
||||
import { createFileEventLogger, parseEventLine, type EventBase } from '../progress/event-log.js';
|
||||
import type { OpenAICompatClient, LLMEvent } from '../llm/openai-compat.js';
|
||||
|
||||
vi.mock('./agent-loop.js', () => ({
|
||||
executeMovement: vi.fn(),
|
||||
}));
|
||||
|
||||
function readAllEvents(workspacePath: string): EventBase[] {
|
||||
const path = join(workspacePath, 'logs', 'events.jsonl');
|
||||
if (!existsSyncEvents(path)) return [];
|
||||
@ -541,7 +535,6 @@ function readAllEvents(workspacePath: string): EventBase[] {
|
||||
});
|
||||
}
|
||||
|
||||
import { existsSync as existsSyncEvents } from 'fs';
|
||||
|
||||
describe('Traceability T-2: piece-runner emission for subtask boundary + followup', () => {
|
||||
let workspace: string;
|
||||
|
||||
@ -5,6 +5,14 @@ import { mkdtempSync, rmSync, mkdirSync, writeFileSync, readFileSync, existsSync
|
||||
import { tmpdir } from 'os';
|
||||
import { join } from 'path';
|
||||
import matter from 'gray-matter';
|
||||
// This mock is intentionally file-wide. vi.mock is hoisted, so keeping it at
|
||||
// top level makes the actual test behavior explicit and Vitest 4 compatible.
|
||||
vi.mock('child_process', () => ({
|
||||
execSync: () => {
|
||||
throw new Error('git: command not found');
|
||||
},
|
||||
}));
|
||||
|
||||
|
||||
// ---- helpers ----------------------------------------------------------------
|
||||
|
||||
@ -134,21 +142,8 @@ describe('silentFork', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// ---- git-unavailable suite (isolated via vi.mock) ---------------------------
|
||||
//
|
||||
// vi.mock hoisting means this mock is set up before any import of the module
|
||||
// under test, making execSync throw unconditionally for this describe block.
|
||||
|
||||
// ---- git-unavailable suite --------------------------------------------------
|
||||
describe('silentFork — git unavailable', () => {
|
||||
// Hoist the mock so it applies before the module is evaluated.
|
||||
vi.mock('child_process', () => {
|
||||
return {
|
||||
execSync: () => {
|
||||
throw new Error('git: command not found');
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
let dir: string;
|
||||
let builtinDir: string;
|
||||
let dataDir: string;
|
||||
|
||||
@ -143,6 +143,13 @@ function isBlockedDocsSubpath(relFromDocs: string): boolean {
|
||||
});
|
||||
}
|
||||
|
||||
function sanitizeListedDescription(description: string): string {
|
||||
const referencesInternalDoc = BLOCKED_DOCS_SUBPATHS.some((blocked) =>
|
||||
description.includes(blocked.replace(/\/$/, '')),
|
||||
);
|
||||
return referencesInternalDoc ? '(description omitted: internal reference)' : description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Map a symbolic doc name to a concrete file path under an allow-listed root.
|
||||
* Returns null on invalid names, attempted traversal, or blocked internal docs.
|
||||
@ -306,7 +313,7 @@ function extractMarkdownDescription(filePath: string): string {
|
||||
}
|
||||
if (line.startsWith('#')) continue;
|
||||
// Use this line as description
|
||||
return line.slice(0, 140);
|
||||
return sanitizeListedDescription(line.slice(0, 140));
|
||||
}
|
||||
} catch {
|
||||
// ignore
|
||||
@ -326,7 +333,7 @@ function extractPieceDescription(filePath: string): string {
|
||||
.split('\n')
|
||||
.map((s) => s.trim())
|
||||
.find((s) => s.length > 0);
|
||||
return (first ?? '(no description)').slice(0, 140);
|
||||
return sanitizeListedDescription((first ?? '(no description)').slice(0, 140));
|
||||
}
|
||||
} catch {
|
||||
// ignore
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import * as dns from 'dns';
|
||||
import { isIP } from 'node:net';
|
||||
import { isPrivateOrForbidden } from '../../../net/ssrf-strict.js';
|
||||
|
||||
// These delegate to the hardened range check in src/net/ssrf-strict.ts so that
|
||||
@ -6,11 +7,13 @@ import { isPrivateOrForbidden } from '../../../net/ssrf-strict.js';
|
||||
// loopback, RFC1918, link-local + cloud metadata (169.254/16, fd00:ec2::),
|
||||
// CGNAT (100.64/10), 0.0.0.0/8, IPv4-mapped IPv6, NAT64, multicast, reserved.
|
||||
export function isPrivateIPv4(ip: string): boolean {
|
||||
if (isIP(ip) !== 4) return false;
|
||||
return isPrivateOrForbidden(ip, 4);
|
||||
}
|
||||
|
||||
export function isPrivateIPv6(ip: string): boolean {
|
||||
const normalized = ip.toLowerCase().replace(/^\[|\]$/g, '');
|
||||
if (isIP(normalized) !== 6) return false;
|
||||
return isPrivateOrForbidden(normalized, 6);
|
||||
}
|
||||
|
||||
|
||||
59
src/instance-lock.test.ts
Normal file
59
src/instance-lock.test.ts
Normal file
@ -0,0 +1,59 @@
|
||||
import { describe, it, expect, afterEach } from 'vitest';
|
||||
import { mkdtempSync, writeFileSync, existsSync, rmSync } from 'fs';
|
||||
import { tmpdir, hostname } from 'os';
|
||||
import { join } from 'path';
|
||||
import { acquireWorkerLock } from './instance-lock.js';
|
||||
|
||||
const dirs: string[] = [];
|
||||
function tmpDb(): string {
|
||||
const d = mkdtempSync(join(tmpdir(), 'maestro-lock-'));
|
||||
dirs.push(d);
|
||||
return join(d, 'maestro.db');
|
||||
}
|
||||
|
||||
afterEach(() => {
|
||||
for (const d of dirs) rmSync(d, { recursive: true, force: true });
|
||||
dirs.length = 0;
|
||||
delete process.env['MAESTRO_FORCE_WORKER_LOCK'];
|
||||
});
|
||||
|
||||
describe('acquireWorkerLock', () => {
|
||||
it('writes a lock file and release() removes it', () => {
|
||||
const db = tmpDb();
|
||||
const lock = acquireWorkerLock(db);
|
||||
expect(existsSync(lock.path)).toBe(true);
|
||||
lock.release();
|
||||
expect(existsSync(lock.path)).toBe(false);
|
||||
lock.release(); // idempotent
|
||||
});
|
||||
|
||||
it('throws when a live process on this host already holds the lock', () => {
|
||||
const db = tmpDb();
|
||||
// process.ppid is alive (it spawned us) and is not our pid.
|
||||
writeFileSync(`${db}.worker.lock`, JSON.stringify({
|
||||
pid: process.ppid, host: hostname(), startedAt: new Date().toISOString(),
|
||||
}));
|
||||
expect(() => acquireWorkerLock(db)).toThrow(/already running/);
|
||||
});
|
||||
|
||||
it('reclaims a stale lock from a dead pid', () => {
|
||||
const db = tmpDb();
|
||||
writeFileSync(`${db}.worker.lock`, JSON.stringify({
|
||||
pid: 2147483646, host: hostname(), startedAt: new Date().toISOString(),
|
||||
}));
|
||||
const lock = acquireWorkerLock(db); // must not throw
|
||||
expect(existsSync(lock.path)).toBe(true);
|
||||
lock.release();
|
||||
});
|
||||
|
||||
it('MAESTRO_FORCE_WORKER_LOCK=1 overrides a live lock', () => {
|
||||
const db = tmpDb();
|
||||
writeFileSync(`${db}.worker.lock`, JSON.stringify({
|
||||
pid: process.ppid, host: hostname(), startedAt: new Date().toISOString(),
|
||||
}));
|
||||
process.env['MAESTRO_FORCE_WORKER_LOCK'] = '1';
|
||||
const lock = acquireWorkerLock(db);
|
||||
expect(existsSync(lock.path)).toBe(true);
|
||||
lock.release();
|
||||
});
|
||||
});
|
||||
102
src/instance-lock.ts
Normal file
102
src/instance-lock.ts
Normal file
@ -0,0 +1,102 @@
|
||||
/**
|
||||
* instance-lock.ts — single-writer guard for worker mode.
|
||||
*
|
||||
* Multiple maestro worker processes pointed at the same SQLite DB each run
|
||||
* their own WorkerManager + Scheduler. They share the `jobs` table and the
|
||||
* `config.yaml` file, so they fight over job claims, issue locks, and config
|
||||
* reloads — which in production caused jobs to be picked up and executed
|
||||
* twice. (An orphaned second process on a different port survived a restart
|
||||
* because nothing stopped it.)
|
||||
*
|
||||
* This lock makes a second worker process on the same host refuse to start
|
||||
* instead of silently double-scheduling. It is intentionally lightweight: a
|
||||
* JSON lock file next to the DB holding the owner pid + host. Liveness is
|
||||
* checked with `process.kill(pid, 0)` (same host only). A wedged lock can be
|
||||
* overridden with MAESTRO_FORCE_WORKER_LOCK=1.
|
||||
*/
|
||||
import { readFileSync, writeFileSync, existsSync, unlinkSync } from 'fs';
|
||||
import { hostname } from 'os';
|
||||
import { logger } from './logger.js';
|
||||
|
||||
export interface WorkerLock {
|
||||
/** Remove the lock file if we still own it. Idempotent. */
|
||||
release(): void;
|
||||
/** Absolute/relative path of the lock file (for diagnostics + tests). */
|
||||
readonly path: string;
|
||||
}
|
||||
|
||||
interface LockRecord {
|
||||
pid: number;
|
||||
host: string;
|
||||
startedAt: string;
|
||||
}
|
||||
|
||||
function isAlive(pid: number): boolean {
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
return true;
|
||||
} catch (e) {
|
||||
// ESRCH → no such process (dead). EPERM → exists but not signalable by us (alive).
|
||||
return (e as NodeJS.ErrnoException)?.code === 'EPERM';
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Acquire the worker lock for `dbPath`. Throws if another live worker process
|
||||
* on this host already holds it. Returns a handle whose `release()` removes the
|
||||
* lock — wire it into the shutdown sequence.
|
||||
*/
|
||||
export function acquireWorkerLock(dbPath: string): WorkerLock {
|
||||
const lockPath = `${dbPath}.worker.lock`;
|
||||
const host = hostname();
|
||||
const force = process.env['MAESTRO_FORCE_WORKER_LOCK'] === '1';
|
||||
|
||||
if (existsSync(lockPath) && !force) {
|
||||
try {
|
||||
const prev = JSON.parse(readFileSync(lockPath, 'utf-8')) as LockRecord;
|
||||
if (prev.host === host && prev.pid !== process.pid && isAlive(prev.pid)) {
|
||||
throw new Error(
|
||||
`Another maestro worker process is already running ` +
|
||||
`(pid=${prev.pid}, host=${host}, since ${prev.startedAt}). ` +
|
||||
`Lock file: ${lockPath}. Stop that process first, or set ` +
|
||||
`MAESTRO_FORCE_WORKER_LOCK=1 to override a wedged lock.`,
|
||||
);
|
||||
}
|
||||
if (prev.host !== host) {
|
||||
logger.warn(
|
||||
`[instance-lock] lock held by a different host (${prev.host}, pid=${prev.pid}); ` +
|
||||
`cannot verify liveness. Proceeding, but multiple worker hosts against one ` +
|
||||
`SQLite DB is unsupported and unsafe.`,
|
||||
);
|
||||
} else {
|
||||
logger.warn(`[instance-lock] reclaiming stale lock from dead pid=${prev.pid}`);
|
||||
}
|
||||
} catch (err) {
|
||||
if (err instanceof Error && err.message.startsWith('Another maestro worker process')) {
|
||||
throw err;
|
||||
}
|
||||
logger.warn(`[instance-lock] unreadable lock file ${lockPath}, reclaiming: ${err}`);
|
||||
}
|
||||
}
|
||||
|
||||
const record: LockRecord = { pid: process.pid, host, startedAt: new Date().toISOString() };
|
||||
writeFileSync(lockPath, JSON.stringify(record), 'utf-8');
|
||||
logger.info(`[instance-lock] acquired worker lock ${lockPath} (pid=${process.pid})`);
|
||||
|
||||
let released = false;
|
||||
return {
|
||||
path: lockPath,
|
||||
release(): void {
|
||||
if (released) return;
|
||||
released = true;
|
||||
try {
|
||||
if (existsSync(lockPath)) {
|
||||
const cur = JSON.parse(readFileSync(lockPath, 'utf-8')) as LockRecord;
|
||||
if (cur.pid === process.pid) unlinkSync(lockPath);
|
||||
}
|
||||
} catch {
|
||||
/* best effort — never block shutdown on lock cleanup */
|
||||
}
|
||||
},
|
||||
};
|
||||
}
|
||||
@ -27,6 +27,7 @@ import { Scheduler } from './scheduler.js';
|
||||
import { buildTitleFallback, buildTitlePrompt } from './title-generation.js';
|
||||
import { initMcpKeyFromFile } from './mcp/crypto.js';
|
||||
import { registerShutdownHook, installSignalHandlers } from './bridge/shutdown.js';
|
||||
import { acquireWorkerLock } from './instance-lock.js';
|
||||
import { checkBwrapAvailable } from './engine/tools/sandbox.js';
|
||||
import { SkillCatalog } from './engine/skills.js';
|
||||
import { VapidKeyStore } from './vapid-store.js';
|
||||
@ -78,6 +79,12 @@ export async function start(opts: StartWorkerOptions = {}): Promise<void> {
|
||||
const dbPath = process.env['DB_PATH'] ?? './data/maestro.db';
|
||||
runPreflight(configPath, dbPath);
|
||||
|
||||
// Single-writer guard: refuse to start if another worker process on this
|
||||
// host already owns this DB. Prevents two schedulers from double-claiming
|
||||
// and double-executing jobs (see instance-lock.ts).
|
||||
const workerLock = acquireWorkerLock(dbPath);
|
||||
registerShutdownHook('worker-instance-lock', async () => workerLock.release());
|
||||
|
||||
const requiresBwrap =
|
||||
config.safety?.bashUnrestricted || config.safety?.bashSandbox === 'always';
|
||||
if (requiresBwrap) {
|
||||
|
||||
@ -1,8 +1,171 @@
|
||||
// src/worker-manager.test.ts
|
||||
import { describe, it, expect, vi } from 'vitest';
|
||||
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
||||
import type { AppConfig, WorkerDef } from './config.js';
|
||||
|
||||
describe('WorkerManager', () => {
|
||||
it('placeholder for integration tests after index.ts refactor', () => {
|
||||
expect(true).toBe(true);
|
||||
// Capture every MockWorker the manager constructs so tests can inspect
|
||||
// lifecycle (started / stopped / config hot-swaps / in-flight count).
|
||||
const instances: MockWorker[] = [];
|
||||
|
||||
class MockWorker {
|
||||
started = false;
|
||||
stopped = false;
|
||||
configUpdates = 0;
|
||||
inflight = 0;
|
||||
config: AppConfig;
|
||||
constructor(
|
||||
public id: string,
|
||||
public endpoint: string,
|
||||
public model: string | undefined,
|
||||
public repo: unknown,
|
||||
config: AppConfig,
|
||||
) {
|
||||
this.config = config;
|
||||
instances.push(this);
|
||||
}
|
||||
start(): void { this.started = true; }
|
||||
stop(): void { this.stopped = true; }
|
||||
updateConfig(c: AppConfig): void { this.config = c; this.configUpdates++; }
|
||||
get inflightCount(): number { return this.inflight; }
|
||||
async waitForCompletion(): Promise<boolean> { return this.inflight === 0; }
|
||||
setMcpTokenManager(): void {}
|
||||
setWorkerMetrics(): void {}
|
||||
setSkillCatalog(): void {}
|
||||
setPushService(): void {}
|
||||
}
|
||||
|
||||
vi.mock('./worker.js', () => ({ Worker: MockWorker }));
|
||||
|
||||
// Imported after the mock is registered.
|
||||
const { WorkerManager } = await import('./worker-manager.js');
|
||||
|
||||
function def(id: string, extra: Partial<WorkerDef> = {}): WorkerDef {
|
||||
return { id, endpoint: `http://${id}:8080/v1`, roles: ['auto'], maxConcurrency: 1, ...extra } as WorkerDef;
|
||||
}
|
||||
|
||||
function cfgWith(workers: WorkerDef[], extra: Record<string, unknown> = {}): AppConfig {
|
||||
return { provider: { workers }, ...extra } as unknown as AppConfig;
|
||||
}
|
||||
|
||||
function makeManager(initial: AppConfig) {
|
||||
const requeueRunningJobs = vi.fn();
|
||||
const repo = { requeueRunningJobs } as any;
|
||||
let current = initial;
|
||||
const configManager = {
|
||||
getConfig: () => current,
|
||||
onConfigChanged: () => {},
|
||||
setConfig: (c: AppConfig) => { current = c; },
|
||||
} as any;
|
||||
const wm = new WorkerManager(repo, configManager);
|
||||
return { wm, requeueRunningJobs, configManager };
|
||||
}
|
||||
|
||||
beforeEach(() => { instances.length = 0; });
|
||||
|
||||
describe('WorkerManager differential rebuild', () => {
|
||||
it('keeps a busy worker alive (no requeue) when its def is unchanged', async () => {
|
||||
const cfg1 = cfgWith([def('A'), def('B')], { safety: { maxIterations: 200 } });
|
||||
const { wm, requeueRunningJobs } = makeManager(cfg1);
|
||||
wm.start();
|
||||
|
||||
const [a, b] = wm.getWorkers() as unknown as MockWorker[];
|
||||
a.inflight = 1; // A is mid-job
|
||||
|
||||
// Only a *global* field changed; worker defs are byte-identical.
|
||||
const cfg2 = cfgWith(
|
||||
[structuredClone(def('A')), structuredClone(def('B'))],
|
||||
{ safety: { maxIterations: 50 } },
|
||||
);
|
||||
await wm.rebuild(cfg2);
|
||||
|
||||
// The busy worker must NOT be torn down and its job must NOT be requeued.
|
||||
expect(requeueRunningJobs).not.toHaveBeenCalled();
|
||||
expect(a.stopped).toBe(false);
|
||||
expect(b.stopped).toBe(false);
|
||||
expect(a.configUpdates).toBe(1); // config hot-swapped in place
|
||||
expect(a.config).toBe(cfg2);
|
||||
// Same two instances kept — no fresh workers created.
|
||||
expect(wm.getWorkers()).toEqual([a, b]);
|
||||
expect(instances).toHaveLength(2);
|
||||
});
|
||||
|
||||
it('retires a changed-def worker without requeue; finishing its in-flight job', async () => {
|
||||
const cfg1 = cfgWith([def('A'), def('B', { maxConcurrency: 4 })]);
|
||||
const { wm, requeueRunningJobs } = makeManager(cfg1);
|
||||
wm.start();
|
||||
const [, b] = wm.getWorkers() as unknown as MockWorker[];
|
||||
b.inflight = 1; // B busy
|
||||
|
||||
// B's def changes (concurrency 4 -> 2); A unchanged.
|
||||
const cfg2 = cfgWith([structuredClone(def('A')), def('B', { maxConcurrency: 2 })]);
|
||||
await wm.rebuild(cfg2);
|
||||
|
||||
// Rebuild NEVER requeues — that was the double-execution bug.
|
||||
expect(requeueRunningJobs).not.toHaveBeenCalled();
|
||||
// Old B stopped polling (retiring) but is not in the active set...
|
||||
expect(b.stopped).toBe(true);
|
||||
const activeIds = (wm.getWorkers() as unknown as MockWorker[]).map(w => w.id);
|
||||
expect(activeIds.sort()).toEqual(['A', 'B']);
|
||||
// ...and the active B is a fresh instance (3rd constructed), started.
|
||||
const freshB = (wm.getWorkers() as unknown as MockWorker[]).find(w => w.id === 'B')!;
|
||||
expect(freshB).not.toBe(b);
|
||||
expect(freshB.started).toBe(true);
|
||||
});
|
||||
|
||||
it('removing a worker retires it without requeue', async () => {
|
||||
const cfg1 = cfgWith([def('A'), def('B')]);
|
||||
const { wm, requeueRunningJobs } = makeManager(cfg1);
|
||||
wm.start();
|
||||
const [, b] = wm.getWorkers() as unknown as MockWorker[];
|
||||
b.inflight = 1;
|
||||
|
||||
const cfg2 = cfgWith([structuredClone(def('A'))]); // B removed
|
||||
await wm.rebuild(cfg2);
|
||||
|
||||
expect(requeueRunningJobs).not.toHaveBeenCalled();
|
||||
expect(b.stopped).toBe(true);
|
||||
expect((wm.getWorkers() as unknown as MockWorker[]).map(w => w.id)).toEqual(['A']);
|
||||
});
|
||||
|
||||
it('prunes a retired worker once its in-flight job finishes (on next rebuild)', async () => {
|
||||
const cfg1 = cfgWith([def('A', { maxConcurrency: 4 })]);
|
||||
const { wm } = makeManager(cfg1);
|
||||
wm.start();
|
||||
const [a] = wm.getWorkers() as unknown as MockWorker[];
|
||||
a.inflight = 1;
|
||||
|
||||
// Change A's def -> A retired (still draining, inflight=1).
|
||||
const cfg2 = cfgWith([def('A', { maxConcurrency: 2 })]);
|
||||
await wm.rebuild(cfg2);
|
||||
expect(a.stopped).toBe(true);
|
||||
|
||||
// A's job finishes; a no-op rebuild should prune the retired instance.
|
||||
a.inflight = 0;
|
||||
const cfg3 = cfgWith([structuredClone(def('A', { maxConcurrency: 2 }))]);
|
||||
await wm.rebuild(cfg3);
|
||||
// Active set has exactly one A; the drained retired instance is gone.
|
||||
expect((wm.getWorkers() as unknown as MockWorker[]).filter(w => w.id === 'A')).toHaveLength(1);
|
||||
});
|
||||
});
|
||||
|
||||
describe('WorkerManager shutdown', () => {
|
||||
it('requeues running jobs only on shutdown when a worker fails to drain', async () => {
|
||||
const cfg = cfgWith([def('A')]);
|
||||
const { wm, requeueRunningJobs } = makeManager(cfg);
|
||||
wm.start();
|
||||
const [a] = wm.getWorkers() as unknown as MockWorker[];
|
||||
a.inflight = 1; // never drains -> waitForCompletion returns false
|
||||
|
||||
const res = await wm.stop(0);
|
||||
expect(res.requeued).toContain('A');
|
||||
expect(requeueRunningJobs).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it('does not requeue on a clean shutdown', async () => {
|
||||
const cfg = cfgWith([def('A')]);
|
||||
const { wm, requeueRunningJobs } = makeManager(cfg);
|
||||
wm.start();
|
||||
const res = await wm.stop(0);
|
||||
expect(res.drained).toContain('A');
|
||||
expect(requeueRunningJobs).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
import { Worker } from './worker.js';
|
||||
import { Repository } from './db/repository.js';
|
||||
import { ConfigManager } from './config-manager.js';
|
||||
import { type AppConfig } from './config.js';
|
||||
import { type AppConfig, type WorkerDef } from './config.js';
|
||||
import { logger } from './logger.js';
|
||||
import type { McpTokenManager } from './mcp/token-manager.js';
|
||||
import type { WorkerMetrics } from './metrics/worker-metrics.js';
|
||||
@ -11,6 +11,16 @@ import type { PushService } from './push-service.js';
|
||||
|
||||
export class WorkerManager {
|
||||
private workers: Worker[] = [];
|
||||
/**
|
||||
* Workers removed by a differential rebuild that still have an in-flight
|
||||
* job. They have stopped polling (claim no new work) but their detached
|
||||
* agent loop is allowed to finish — we deliberately do NOT requeue the job,
|
||||
* which is what previously caused the same job to run twice. Pruned on the
|
||||
* next rebuild and drained on shutdown.
|
||||
*/
|
||||
private retiring: Worker[] = [];
|
||||
/** Config the current `workers` were built from — used to diff worker defs. */
|
||||
private lastConfig: AppConfig | null = null;
|
||||
private repo: Repository;
|
||||
private configManager: ConfigManager;
|
||||
private rebuildPromise: Promise<any> | null = null;
|
||||
@ -37,43 +47,55 @@ export class WorkerManager {
|
||||
start(): void {
|
||||
const config = this.configManager.getConfig();
|
||||
this.workers = this.createWorkers(config);
|
||||
this.lastConfig = config;
|
||||
for (const w of this.workers) w.start();
|
||||
logger.info(`WorkerManager: started ${this.workers.length} workers`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Full shutdown of every worker (active + retiring). The global
|
||||
* `requeueRunningJobs()` is only valid here, on true process shutdown:
|
||||
* the process is exiting so the in-flight agent loops die with it, and
|
||||
* requeuing lets the next process pick the jobs up. It must NEVER be used
|
||||
* on a config rebuild (see doRebuild) — there the old loops keep running,
|
||||
* so requeuing would let the same job execute twice.
|
||||
*/
|
||||
async stop(timeoutMs = 30000): Promise<{ drained: string[]; requeued: string[] }> {
|
||||
const drained: string[] = [];
|
||||
const requeued: string[] = [];
|
||||
|
||||
const all = [...this.workers, ...this.retiring];
|
||||
|
||||
// Stop polling for all workers
|
||||
for (const w of this.workers) w.stop();
|
||||
for (const w of all) w.stop();
|
||||
|
||||
// Wait for completion with timeout
|
||||
const results = await Promise.all(
|
||||
this.workers.map(w => w.waitForCompletion(timeoutMs))
|
||||
all.map(w => w.waitForCompletion(timeoutMs))
|
||||
);
|
||||
|
||||
// Track drained (completed normally) and requeue (timed out)
|
||||
for (let i = 0; i < this.workers.length; i++) {
|
||||
const w = this.workers[i]!;
|
||||
for (let i = 0; i < all.length; i++) {
|
||||
const w = all[i]!;
|
||||
if (results[i]) {
|
||||
drained.push(w.id);
|
||||
} else {
|
||||
// Timed out — requeue running jobs for this worker
|
||||
// Note: Repository doesn't have a workerId filter for running jobs,
|
||||
// so we requeue all running jobs. This is acceptable because we just
|
||||
// stopped all workers.
|
||||
// so we requeue all running jobs. This is acceptable because the
|
||||
// process is shutting down and all worker loops are being torn down.
|
||||
requeued.push(w.id);
|
||||
}
|
||||
}
|
||||
|
||||
// If any workers timed out, requeue all running jobs
|
||||
// If any workers timed out, requeue all running jobs (shutdown only)
|
||||
if (requeued.length > 0) {
|
||||
this.repo.requeueRunningJobs();
|
||||
logger.info(`WorkerManager: requeued running jobs for timed-out workers: ${requeued.join(', ')}`);
|
||||
}
|
||||
|
||||
this.workers = [];
|
||||
this.retiring = [];
|
||||
return { drained, requeued };
|
||||
}
|
||||
|
||||
@ -90,13 +112,80 @@ export class WorkerManager {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Differential rebuild on config change.
|
||||
*
|
||||
* Workers whose own def is unchanged are KEPT running (config hot-swapped
|
||||
* in place) — so a change to, say, a tool size limit never restarts a busy
|
||||
* worker. Workers whose def changed (or were removed/disabled) are retired:
|
||||
* they stop polling but their in-flight agent loop finishes naturally and
|
||||
* the job is NOT requeued. The job stays `running` + issue-locked, so a
|
||||
* freshly started worker with the same id can't re-claim it.
|
||||
*
|
||||
* This closes the bug where any config edit during a long-running job
|
||||
* caused the job to be requeued under a still-running loop and executed a
|
||||
* second time concurrently.
|
||||
*/
|
||||
private async doRebuild(newConfig: AppConfig): Promise<{ drained: string[]; requeued: string[] }> {
|
||||
logger.info('WorkerManager: rebuilding workers for new config');
|
||||
const result = await this.stop();
|
||||
this.workers = this.createWorkers(newConfig);
|
||||
for (const w of this.workers) w.start();
|
||||
logger.info(`WorkerManager: rebuilt ${this.workers.length} workers`);
|
||||
return result;
|
||||
|
||||
const oldSigs = this.defSignatures(this.lastConfig ?? this.configManager.getConfig());
|
||||
const newSigs = this.defSignatures(newConfig);
|
||||
|
||||
const kept: Worker[] = [];
|
||||
const retired: Worker[] = [];
|
||||
for (const w of this.workers) {
|
||||
const newSig = newSigs.get(w.id);
|
||||
if (newSig !== undefined && newSig === oldSigs.get(w.id)) {
|
||||
w.updateConfig(newConfig); // def unchanged → keep alive, refresh global config
|
||||
kept.push(w);
|
||||
} else {
|
||||
retired.push(w); // removed, disabled, or def changed
|
||||
}
|
||||
}
|
||||
|
||||
// Retire: stop polling only. Do NOT requeue — the detached loop finishes
|
||||
// on its own and the job remains running + locked, so no double execution.
|
||||
for (const w of retired) w.stop();
|
||||
this.retiring.push(...retired);
|
||||
|
||||
// Create fresh workers for ids not already kept.
|
||||
const keptIds = new Set(kept.map(w => w.id));
|
||||
const fresh = newConfig.provider.workers
|
||||
.filter(def => def.enabled !== false && !keptIds.has(def.id))
|
||||
.map(def => this.createWorker(def, newConfig));
|
||||
for (const w of fresh) w.start();
|
||||
|
||||
this.workers = [...kept, ...fresh];
|
||||
this.lastConfig = newConfig;
|
||||
|
||||
const drained = this.pruneRetiring();
|
||||
if (this.retiring.length > 0) {
|
||||
logger.info(
|
||||
`WorkerManager: ${this.retiring.length} retired worker(s) draining in-flight jobs, no requeue: ${this.retiring.map(w => w.id).join(', ')}`
|
||||
);
|
||||
}
|
||||
logger.info(
|
||||
`WorkerManager: rebuilt kept=${kept.length} fresh=${fresh.length} retired=${retired.length} retiring=${this.retiring.length}`
|
||||
);
|
||||
return { drained, requeued: [] };
|
||||
}
|
||||
|
||||
/** Stable per-id signature of each enabled worker def, for rebuild diffing. */
|
||||
private defSignatures(config: AppConfig): Map<string, string> {
|
||||
const sigs = new Map<string, string>();
|
||||
for (const def of config.provider.workers) {
|
||||
if (def.enabled === false) continue;
|
||||
sigs.set(def.id, JSON.stringify(def));
|
||||
}
|
||||
return sigs;
|
||||
}
|
||||
|
||||
/** Drop retired workers whose in-flight job has finished; returns their ids. */
|
||||
private pruneRetiring(): string[] {
|
||||
const done = this.retiring.filter(w => w.inflightCount === 0);
|
||||
this.retiring = this.retiring.filter(w => w.inflightCount > 0);
|
||||
return done.map(w => w.id);
|
||||
}
|
||||
|
||||
getWorkers(): Worker[] {
|
||||
@ -143,35 +232,18 @@ export class WorkerManager {
|
||||
}
|
||||
|
||||
private createWorkers(config: AppConfig): Worker[] {
|
||||
const workers = config.provider.workers
|
||||
return config.provider.workers
|
||||
.filter(def => def.enabled !== false)
|
||||
.map(def => new Worker(
|
||||
def.id,
|
||||
def.endpoint,
|
||||
def.model,
|
||||
this.repo,
|
||||
config,
|
||||
));
|
||||
if (this.mcpTokenManager) {
|
||||
for (const w of workers) {
|
||||
w.setMcpTokenManager(this.mcpTokenManager);
|
||||
.map(def => this.createWorker(def, config));
|
||||
}
|
||||
}
|
||||
if (this.workerMetrics) {
|
||||
for (const w of workers) {
|
||||
w.setWorkerMetrics(this.workerMetrics);
|
||||
}
|
||||
}
|
||||
if (this.skillCatalog) {
|
||||
for (const w of workers) {
|
||||
w.setSkillCatalog(this.skillCatalog);
|
||||
}
|
||||
}
|
||||
if (this.pushService) {
|
||||
for (const w of workers) {
|
||||
w.setPushService(this.pushService);
|
||||
}
|
||||
}
|
||||
return workers;
|
||||
|
||||
/** Build a single worker and wire up the optional collaborators. */
|
||||
private createWorker(def: WorkerDef, config: AppConfig): Worker {
|
||||
const w = new Worker(def.id, def.endpoint, def.model, this.repo, config);
|
||||
if (this.mcpTokenManager) w.setMcpTokenManager(this.mcpTokenManager);
|
||||
if (this.workerMetrics) w.setWorkerMetrics(this.workerMetrics);
|
||||
if (this.skillCatalog) w.setSkillCatalog(this.skillCatalog);
|
||||
if (this.pushService) w.setPushService(this.pushService);
|
||||
return w;
|
||||
}
|
||||
}
|
||||
|
||||
@ -383,6 +383,24 @@ export class Worker {
|
||||
this.pushService = svc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Hot-swap the global config on a still-running worker. Used by
|
||||
* WorkerManager's differential rebuild: when a config change does NOT
|
||||
* touch this worker's own def (e.g. a tool size limit changed), we keep
|
||||
* the worker — and any in-flight job — alive and just refresh the config
|
||||
* it reads for future jobs. Def-derived values (roles, endpoint,
|
||||
* maxConcurrency) are read live via getWorkerDef(), so they stay correct.
|
||||
* In-flight jobs keep the settings they captured at start, by design.
|
||||
*/
|
||||
public updateConfig(config: AppConfig): void {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
/** Jobs currently executing in this worker's detached loops. */
|
||||
public get inflightCount(): number {
|
||||
return this.inflight;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fire a V2 push for a job status transition. Fire-and-forget — never
|
||||
* throws and never awaits the underlying queue. Skips silently when
|
||||
|
||||
1904
ui/package-lock.json
generated
1904
ui/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -18,11 +18,11 @@
|
||||
"@xterm/addon-fit": "^0.10.0",
|
||||
"@xterm/xterm": "^5.5.0",
|
||||
"autoprefixer": "^10.4.27",
|
||||
"dompurify": "^3.3.3",
|
||||
"dompurify": "^3.4.8",
|
||||
"highlight.js": "^11.11.1",
|
||||
"marked": "^17.0.4",
|
||||
"mermaid": "^11.13.0",
|
||||
"postcss": "^8.5.8",
|
||||
"mermaid": "^11.15.0",
|
||||
"postcss": "^8.5.15",
|
||||
"react": "^18.3.1",
|
||||
"react-dom": "^18.3.1",
|
||||
"tailwindcss": "^3.4.19",
|
||||
@ -31,8 +31,13 @@
|
||||
"devDependencies": {
|
||||
"@types/react": "^18.3.20",
|
||||
"@types/react-dom": "^18.3.5",
|
||||
"@vitejs/plugin-react": "^4.3.4",
|
||||
"@vitejs/plugin-react": "^6.0.2",
|
||||
"typescript": "^5.4.3",
|
||||
"vite": "^5.4.21"
|
||||
"vite": "^8.0.16"
|
||||
},
|
||||
"overrides": {
|
||||
"monaco-editor": {
|
||||
"dompurify": "3.4.8"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
12
vitest.config.ts
Normal file
12
vitest.config.ts
Normal file
@ -0,0 +1,12 @@
|
||||
import { configDefaults, defineConfig } from 'vitest/config';
|
||||
|
||||
export default defineConfig({
|
||||
test: {
|
||||
exclude: [
|
||||
...configDefaults.exclude,
|
||||
'.claude/**',
|
||||
'.superpowers/**',
|
||||
'.worktrees/**',
|
||||
],
|
||||
},
|
||||
});
|
||||
Loading…
x
Reference in New Issue
Block a user