maestro/scripts/bench-run.ts
2026-06-03 05:08:00 +00:00

163 lines
6.1 KiB
TypeScript

#!/usr/bin/env -S npx tsx
/**
* CLI entry for the benchmark runner.
*
* npm run bench # all tasks
* npm run bench -- --task=composite-mini-report
* npm run bench -- --server=http://127.0.0.1:9876
* BENCH_JUDGE=off npm run bench # skip LLM judge (axis D fixed at 1.0)
*
* Prereq: scripts/server.sh start (or `npm start`) running on the configured port.
*/
import * as fs from 'fs';
import * as path from 'path';
import { parse as parseYaml } from 'yaml';
import { startFixtureServer } from '../src/bench/fixture-server.js';
import { runOneTask } from '../src/bench/runner.js';
import {
computeTotal,
gradeChecklist,
gradeInstructionsProgrammatic,
gradeTools,
} from '../src/bench/grader.js';
import { gradeReasoning, loadJudgeConfigFromEnv } from '../src/bench/judge.js';
import { writeRunSummary } from '../src/bench/summary.js';
import type { BenchResult, BenchTask, RawJobResult } from '../src/bench/types.js';
import { loadConfig } from '../src/config.js';
import { logger } from '../src/logger.js';
/** Options for a single benchmark invocation, as produced by `parseArgs`. */
interface Args {
  /** Base URL of the server the runner is pointed at. */
  serverUrl: string;
  /** Absolute path of the `bench` directory under the current working directory. */
  benchRoot: string;
  /** Per-run output directory: `<benchRoot>/results/<timestamp-derived run id>`. */
  resultDir: string;
  /** Task id given via `--task=`; left undefined when the flag is not supplied. */
  taskFilter?: string;
}
/**
 * Build the run options from `process.argv` and the environment.
 *
 * Recognised flags (anything else is ignored):
 *   --task=<id>      only run the task with this id
 *   --server=<url>   server base URL
 *
 * Server URL precedence: last non-empty `--server=` flag, then a non-empty
 * `BENCH_SERVER_URL`, then `http://127.0.0.1:9876`.
 *
 * @returns the parsed options; `resultDir` embeds the current time, so two
 *          calls generally return different result directories.
 */
function parseArgs(): Args {
  const defaultServerUrl = 'http://127.0.0.1:9876';

  // An exported-but-empty variable (`BENCH_SERVER_URL= npm run bench`) is not
  // a usable URL, so treat it the same as an unset one.
  const envServerUrl = process.env['BENCH_SERVER_URL'];
  let serverUrl =
    envServerUrl !== undefined && envServerUrl !== '' ? envServerUrl : defaultServerUrl;
  let taskFilter: string | undefined;

  for (const arg of process.argv.slice(2)) {
    if (arg.startsWith('--task=')) {
      taskFilter = arg.slice('--task='.length);
    } else if (arg.startsWith('--server=')) {
      const value = arg.slice('--server='.length);
      // `--server=` with nothing after it must not wipe out a usable URL.
      if (value !== '') serverUrl = value;
    }
  }

  const benchRoot = path.resolve(process.cwd(), 'bench');
  // ISO timestamp with ':' and '.' replaced by '-' so it is usable as a directory name.
  const runId = new Date().toISOString().replace(/[:.]/g, '-');
  const resultDir = path.join(benchRoot, 'results', runId);
  return { taskFilter, serverUrl, benchRoot, resultDir };
}
/**
 * Load task definitions from `<benchRoot>/tasks`.
 *
 * Every `.yaml` / `.yml` file directly inside that directory is parsed as one
 * task. Files whose document is not a mapping with a string `id` (for example
 * an empty file, which parses to `null`) are skipped with a warning instead of
 * being returned as a bogus task that would crash later on `task.id`.
 *
 * Errors raised while reading or parsing a file are not caught here.
 *
 * @param benchRoot directory that contains the `tasks` folder
 * @param filter    when non-empty, only the task whose `id` equals it is kept
 * @returns the loaded tasks in directory-listing order; empty when the
 *          `tasks` directory does not exist
 */
function loadTasks(benchRoot: string, filter?: string): BenchTask[] {
  const taskDir = path.join(benchRoot, 'tasks');
  if (!fs.existsSync(taskDir)) return [];

  const tasks: BenchTask[] = [];
  for (const file of fs.readdirSync(taskDir)) {
    if (!file.endsWith('.yaml') && !file.endsWith('.yml')) continue;

    const filePath = path.join(taskDir, file);
    const parsed: unknown = parseYaml(fs.readFileSync(filePath, 'utf-8'));

    // The id is later compared against the filter and used as a directory
    // name, so anything without a string id cannot be run as a task.
    const hasStringId =
      typeof parsed === 'object' &&
      parsed !== null &&
      typeof (parsed as { id?: unknown }).id === 'string';
    if (!hasStringId) {
      logger.warn(`[bench] skipping ${filePath}: expected a YAML mapping with a string "id"`);
      continue;
    }

    const task = parsed as BenchTask;
    if (filter && task.id !== filter) continue;
    tasks.push(task);
  }
  return tasks;
}
/**
 * Copy the `logs` and `output` subtrees of a job workspace into `dest`.
 *
 * Does nothing when `rawWorkspace` is empty or does not exist. Otherwise
 * `dest` is created (even if neither subtree is present) and each existing
 * subtree is mirrored recursively. Only directories and regular files are
 * handled; other entry types are ignored. A failure to copy an individual
 * file is logged as a warning and does not stop the copy.
 *
 * @param rawWorkspace path of the workspace to copy from
 * @param dest         directory that receives `logs/` and `output/`
 */
function copyWorkspaceArtifacts(rawWorkspace: string, dest: string): void {
  const workspaceMissing = !rawWorkspace || !fs.existsSync(rawWorkspace);
  if (workspaceMissing) return;

  fs.mkdirSync(dest, { recursive: true });

  // Recursively mirror one directory into another, file by file.
  const mirrorTree = (fromDir: string, toDir: string): void => {
    fs.mkdirSync(toDir, { recursive: true });
    const children = fs.readdirSync(fromDir, { withFileTypes: true });
    for (const child of children) {
      const from = path.join(fromDir, child.name);
      const to = path.join(toDir, child.name);
      if (child.isDirectory()) {
        mirrorTree(from, to);
        continue;
      }
      if (!child.isFile()) continue;
      try {
        fs.copyFileSync(from, to);
      } catch (err) {
        logger.warn(`[bench] copy failed ${from} -> ${to}: ${(err as Error).message}`);
      }
    }
  };

  const artifactDirs = ['logs', 'output'];
  artifactDirs.forEach((name) => {
    const from = path.join(rawWorkspace, name);
    if (!fs.existsSync(from)) return;
    mirrorTree(from, path.join(dest, name));
  });
}
/**
 * Run the benchmark end to end.
 *
 * Steps, in order:
 *  1. Parse CLI options and load the project config; derive the judge
 *     defaults from the first configured worker, falling back to the
 *     provider-level settings.
 *  2. Load the tasks; exit with status 1 when none match.
 *  3. Start the fixture web server and run the tasks one after another.
 *     A task whose runner throws is logged and left out of the results.
 *  4. Grade each finished task on the four axes, then write `result.json`
 *     and a copy of the workspace artifacts under `<resultDir>/<task id>/`.
 *  5. Close the fixture server (also on error), write the run summary and
 *     print one line per task to stdout.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const { provider } = loadConfig();
  const primaryWorker = provider.workers?.[0];
  const judgeDefaults = {
    endpoint: primaryWorker?.endpoint ?? provider.baseUrl ?? 'http://127.0.0.1:11434/v1',
    model: primaryWorker?.model ?? provider.model ?? '',
    apiKey: provider.apiKey ?? undefined,
  };
  const judgeConfig = loadJudgeConfigFromEnv(judgeDefaults);

  const tasks = loadTasks(args.benchRoot, args.taskFilter);
  if (tasks.length === 0) {
    logger.error(`[bench] no tasks found (filter=${args.taskFilter ?? 'none'})`);
    process.exit(1);
  }

  fs.mkdirSync(args.resultDir, { recursive: true });
  const judgeLabel = judgeConfig ? `${judgeConfig.endpoint} model=${judgeConfig.model}` : 'off';
  logger.info(`[bench] run dir: ${args.resultDir}`);
  logger.info(`[bench] server: ${args.serverUrl}`);
  logger.info(`[bench] judge: ${judgeLabel}`);

  const webFixtures = path.join(args.benchRoot, 'fixtures', 'web');
  const fixtureServer = await startFixtureServer(webFixtures);
  const results: BenchResult[] = [];
  try {
    for (const task of tasks) {
      const startedAt = new Date().toISOString();
      logger.info(`[bench] === task: ${task.id} ===`);

      let raw: RawJobResult;
      try {
        const runOptions = {
          serverUrl: args.serverUrl,
          benchRoot: args.benchRoot,
          webPort: fixtureServer.port,
        };
        raw = await runOneTask(task, runOptions);
      } catch (err) {
        // A failed runner only skips this task; the remaining tasks still run.
        logger.error(`[bench] runner failed for ${task.id}: ${(err as Error).message}`);
        continue;
      }

      // Axes are graded in a fixed order; only the reasoning axis is awaited.
      const axes = {
        tools: gradeTools(task, raw),
        checklist: gradeChecklist(task, raw),
        instructions: gradeInstructionsProgrammatic(task, raw),
        reasoning: await gradeReasoning(task, raw, judgeConfig),
      };
      const finishedAt = new Date().toISOString();
      const result: BenchResult = {
        taskId: task.id,
        taskTitle: task.title,
        startedAt,
        finishedAt,
        raw,
        axes,
        total: computeTotal(axes),
      };

      const taskOutDir = path.join(args.resultDir, task.id);
      fs.mkdirSync(taskOutDir, { recursive: true });
      copyWorkspaceArtifacts(raw.workspacePath, path.join(taskOutDir, 'workspace'));
      const resultJson = JSON.stringify(result, null, 2);
      fs.writeFileSync(path.join(taskOutDir, 'result.json'), resultJson, 'utf-8');
      results.push(result);
    }
  } finally {
    await fixtureServer.close();
  }

  const summaryPath = writeRunSummary(args.resultDir, results);
  logger.info(`[bench] wrote ${summaryPath}`);

  // Concise stdout summary so CI / humans see totals without opening the file.
  const pct = (score: number): string => (score * 100).toFixed(0);
  for (const { taskId, total, raw, axes } of results) {
    // eslint-disable-next-line no-console
    console.log(`${taskId}: total=${total}/100 status=${raw.status} tools=${pct(axes.tools.score)}% checklist=${pct(axes.checklist.score)}% instr=${pct(axes.instructions.score)}% reasoning=${pct(axes.reasoning.score)}%`);
  }
}
/** Last-resort handler: print whatever `main` rejected with and exit with status 1. */
function reportFatal(err: unknown): never {
  // eslint-disable-next-line no-console
  console.error(err);
  process.exit(1);
}

main().catch(reportFatal);