#!/usr/bin/env -S npx tsx
|
|
/**
 * CLI entry for the benchmark runner.
 *
 *   npm run bench                                     # all tasks
 *   npm run bench -- --task=composite-mini-report
 *   npm run bench -- --server=http://127.0.0.1:9876
 *   BENCH_JUDGE=off npm run bench                     # skip LLM judge (axis D fixed at 1.0)
 *
 * Prereq: scripts/server.sh start (or `npm start`) running on the configured port.
 */
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import { parse as parseYaml } from 'yaml';
|
|
import { startFixtureServer } from '../src/bench/fixture-server.js';
|
|
import { runOneTask } from '../src/bench/runner.js';
|
|
import {
|
|
computeTotal,
|
|
gradeChecklist,
|
|
gradeInstructionsProgrammatic,
|
|
gradeTools,
|
|
} from '../src/bench/grader.js';
|
|
import { gradeReasoning, loadJudgeConfigFromEnv } from '../src/bench/judge.js';
|
|
import { writeRunSummary } from '../src/bench/summary.js';
|
|
import type { BenchResult, BenchTask, RawJobResult } from '../src/bench/types.js';
|
|
import { loadConfig } from '../src/config.js';
|
|
import { logger } from '../src/logger.js';
|
|
|
|
/** Options for one benchmark invocation, as produced by `parseArgs`. */
interface Args {
  /** Task id given via `--task=<id>`; when unset, no task is filtered out. */
  taskFilter?: string;
  /** Base URL of the server under test (`--server=`, `BENCH_SERVER_URL`, or the built-in default). */
  serverUrl: string;
  /** Absolute path of the `bench` directory under the current working directory. */
  benchRoot: string;
  /** Per-run output directory: `<benchRoot>/results/<run id>`. */
  resultDir: string;
}
|
|
|
|
/**
 * Builds the run options from `process.argv` and the environment.
 *
 * Recognised flags:
 *   --task=<id>      only run the task with this id
 *   --server=<url>   server base URL (overrides `BENCH_SERVER_URL`)
 *
 * Empty values (`BENCH_SERVER_URL=`, `--server=`, `--task=`) are treated as
 * "not provided" so that an empty string never ends up as the server URL or
 * as the task filter. Unrecognised arguments are ignored.
 *
 * @returns the resolved options, including a fresh timestamped result directory.
 */
function parseArgs(): Args {
  const defaultServerUrl = 'http://127.0.0.1:9876';

  let taskFilter: string | undefined;
  // `||` (not `??`) on purpose: an empty env var must fall back to the default.
  let serverUrl = process.env['BENCH_SERVER_URL'] || defaultServerUrl;

  for (const arg of process.argv.slice(2)) {
    if (arg.startsWith('--task=')) {
      taskFilter = arg.slice('--task='.length) || undefined;
    } else if (arg.startsWith('--server=')) {
      // An empty `--server=` keeps whatever was resolved so far.
      serverUrl = arg.slice('--server='.length) || serverUrl;
    }
  }

  const benchRoot = path.resolve(process.cwd(), 'bench');
  // ISO timestamp with ':' and '.' replaced by '-' so it can be used as a directory name.
  const runId = new Date().toISOString().replace(/[:.]/g, '-');
  const resultDir = path.join(benchRoot, 'results', runId);

  return { taskFilter, serverUrl, benchRoot, resultDir };
}
|
|
|
|
/**
 * Loads task definitions from `<benchRoot>/tasks/*.yaml|*.yml`.
 *
 * Files whose parsed content is not an object with a non-empty string `id`
 * (for example an empty file, which parses to `null`) are skipped with a
 * warning instead of crashing on `parsed.id` or producing a task that cannot
 * be keyed later. YAML syntax errors are not swallowed; they still propagate.
 *
 * @param benchRoot directory that contains the `tasks` folder
 * @param filter when given, only the task whose `id` equals it is returned
 * @returns the matching tasks in directory-listing order; empty if the tasks
 *          folder does not exist
 */
function loadTasks(benchRoot: string, filter?: string): BenchTask[] {
  const taskDir = path.join(benchRoot, 'tasks');
  if (!fs.existsSync(taskDir)) return [];

  const tasks: BenchTask[] = [];
  for (const file of fs.readdirSync(taskDir)) {
    if (!file.endsWith('.yaml') && !file.endsWith('.yml')) continue;

    const parsed: unknown = parseYaml(fs.readFileSync(path.join(taskDir, file), 'utf-8'));
    const id = typeof parsed === 'object' && parsed !== null ? (parsed as { id?: unknown }).id : undefined;
    if (typeof id !== 'string' || id === '') {
      logger.warn(`[bench] skipping ${file}: not a task document (missing string "id")`);
      continue;
    }

    if (filter && id !== filter) continue;
    // Only `id` is validated here; the remaining fields are trusted as-is.
    tasks.push(parsed as BenchTask);
  }
  return tasks;
}
|
|
|
|
/**
 * Best-effort copy of a job workspace's `logs` and `output` entries into `dest`.
 *
 * Nothing happens when `rawWorkspace` is empty or does not exist. Directories
 * are copied recursively (regular files and sub-directories only; other entry
 * types are skipped). If `logs` or `output` is itself a regular file it is
 * copied as a file rather than being passed to `readdirSync`, which would
 * throw. Failures to read/create a directory or to copy a file are logged as
 * warnings and do not abort the copy of the remaining entries.
 *
 * @param rawWorkspace path of the workspace produced by the job
 * @param dest directory that receives the `logs` / `output` copies
 */
function copyWorkspaceArtifacts(rawWorkspace: string, dest: string): void {
  if (!rawWorkspace || !fs.existsSync(rawWorkspace)) return;
  fs.mkdirSync(dest, { recursive: true });

  const warnCopyFailed = (src: string, target: string, err: unknown): void => {
    const reason = err instanceof Error ? err.message : String(err);
    logger.warn(`[bench] copy failed ${src} -> ${target}: ${reason}`);
  };

  const copyFile = (src: string, target: string): void => {
    try {
      fs.copyFileSync(src, target);
    } catch (err) {
      warnCopyFailed(src, target, err);
    }
  };

  const copyDir = (src: string, target: string): void => {
    let entries: fs.Dirent[];
    try {
      fs.mkdirSync(target, { recursive: true });
      entries = fs.readdirSync(src, { withFileTypes: true });
    } catch (err) {
      // One unreadable directory must not take down the whole benchmark run.
      warnCopyFailed(src, target, err);
      return;
    }
    for (const entry of entries) {
      const from = path.join(src, entry.name);
      const to = path.join(target, entry.name);
      if (entry.isDirectory()) copyDir(from, to);
      else if (entry.isFile()) copyFile(from, to);
    }
  };

  for (const sub of ['logs', 'output']) {
    const src = path.join(rawWorkspace, sub);
    let stat: fs.Stats;
    try {
      stat = fs.statSync(src);
    } catch {
      continue; // entry is absent (or not accessible): nothing to copy
    }
    const target = path.join(dest, sub);
    if (stat.isDirectory()) copyDir(src, target);
    else if (stat.isFile()) copyFile(src, target);
  }
}
|
|
|
|
/**
 * Runs the selected benchmark tasks one after another and writes the results.
 *
 * Steps: resolve CLI args and config, build the judge config, load tasks
 * (exiting with code 1 when none match), start the fixture web server, then
 * for each task run it, grade the four axes, and persist `result.json` plus
 * the copied workspace artifacts under `<resultDir>/<task id>/`. A task whose
 * runner throws is logged and skipped. The fixture server is always closed,
 * after which the run summary is written and a one-line-per-task digest is
 * printed to stdout.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const config = loadConfig();

  // Judge settings default to the first configured worker, then to the provider-level values.
  const primaryWorker = config.provider.workers?.[0];
  const judgeEndpoint = primaryWorker?.endpoint ?? config.provider.baseUrl ?? 'http://127.0.0.1:11434/v1';
  const judgeModel = primaryWorker?.model ?? config.provider.model ?? '';
  const judgeConfig = loadJudgeConfigFromEnv({
    endpoint: judgeEndpoint,
    model: judgeModel,
    apiKey: config.provider.apiKey ?? undefined,
  });

  const tasks = loadTasks(args.benchRoot, args.taskFilter);
  if (tasks.length === 0) {
    logger.error(`[bench] no tasks found (filter=${args.taskFilter ?? 'none'})`);
    process.exit(1);
  }

  fs.mkdirSync(args.resultDir, { recursive: true });
  const judgeLabel = judgeConfig ? `${judgeConfig.endpoint} model=${judgeConfig.model}` : 'off';
  logger.info(`[bench] run dir: ${args.resultDir}`);
  logger.info(`[bench] server: ${args.serverUrl}`);
  logger.info(`[bench] judge: ${judgeLabel}`);

  const fixtureServer = await startFixtureServer(path.join(args.benchRoot, 'fixtures', 'web'));
  const results: BenchResult[] = [];
  try {
    for (const task of tasks) {
      const startedAt = new Date().toISOString();
      logger.info(`[bench] === task: ${task.id} ===`);

      let raw: RawJobResult;
      try {
        raw = await runOneTask(task, {
          serverUrl: args.serverUrl,
          benchRoot: args.benchRoot,
          webPort: fixtureServer.port,
        });
      } catch (err) {
        // A failed run is reported and left out of the results; later tasks still run.
        logger.error(`[bench] runner failed for ${task.id}: ${(err as Error).message}`);
        continue;
      }

      // Grade the four axes in a fixed order; only the reasoning axis is awaited.
      const tools = gradeTools(task, raw);
      const checklist = gradeChecklist(task, raw);
      const instructions = gradeInstructionsProgrammatic(task, raw);
      const reasoning = await gradeReasoning(task, raw, judgeConfig);
      const finishedAt = new Date().toISOString();

      const axes = { tools, checklist, instructions, reasoning };
      const result: BenchResult = {
        taskId: task.id,
        taskTitle: task.title,
        startedAt,
        finishedAt,
        raw,
        axes,
        total: computeTotal(axes),
      };

      const taskResultDir = path.join(args.resultDir, task.id);
      fs.mkdirSync(taskResultDir, { recursive: true });
      copyWorkspaceArtifacts(raw.workspacePath, path.join(taskResultDir, 'workspace'));
      fs.writeFileSync(path.join(taskResultDir, 'result.json'), JSON.stringify(result, null, 2), 'utf-8');
      results.push(result);
    }
  } finally {
    await fixtureServer.close();
  }

  const summaryPath = writeRunSummary(args.resultDir, results);
  logger.info(`[bench] wrote ${summaryPath}`);

  // Concise stdout summary so CI / humans see totals without opening the file.
  const pct = (score: number): string => (score * 100).toFixed(0);
  for (const { taskId, total, raw, axes } of results) {
    const line = [
      `${taskId}:`,
      `total=${total}/100`,
      `status=${raw.status}`,
      `tools=${pct(axes.tools.score)}%`,
      `checklist=${pct(axes.checklist.score)}%`,
      `instr=${pct(axes.instructions.score)}%`,
      `reasoning=${pct(axes.reasoning.score)}%`,
    ].join(' ');
    // eslint-disable-next-line no-console
    console.log(line);
  }
}
|
|
|
|
/** Last-resort handler: print whatever escaped `main` and exit with code 1. */
function reportFatal(fatal: unknown): void {
  // eslint-disable-next-line no-console
  console.error(fatal);
  process.exit(1);
}

main().catch(reportFatal);
|