#!/usr/bin/env -S npx tsx
/**
 * CLI entry for the benchmark runner.
 *
 *   npm run bench                                    # all tasks
 *   npm run bench -- --task=composite-mini-report
 *   npm run bench -- --server=http://127.0.0.1:9876
 *   BENCH_JUDGE=off npm run bench                    # skip LLM judge (axis D fixed at 1.0)
 *
 * Prereq: scripts/server.sh start (or `npm start`) running on the configured port.
 */
import * as fs from 'fs';
import * as path from 'path';
import { parse as parseYaml } from 'yaml';
import { startFixtureServer } from '../src/bench/fixture-server.js';
import { runOneTask } from '../src/bench/runner.js';
import {
  computeTotal,
  gradeChecklist,
  gradeInstructionsProgrammatic,
  gradeTools,
} from '../src/bench/grader.js';
import { gradeReasoning, loadJudgeConfigFromEnv } from '../src/bench/judge.js';
import { writeRunSummary } from '../src/bench/summary.js';
import type { BenchResult, BenchTask, RawJobResult } from '../src/bench/types.js';
import { loadConfig } from '../src/config.js';
import { logger } from '../src/logger.js';

/** Resolved command-line options for one benchmark run. */
interface Args {
  /** When set, only the task whose `id` equals this value is run. */
  taskFilter?: string;
  /** Base URL of the server the runner talks to. */
  serverUrl: string;
  /** Absolute path of the `bench` directory under the current working directory. */
  benchRoot: string;
  /** Per-run output directory: `<benchRoot>/results/<runId>`. */
  resultDir: string;
}

/**
 * Extracts a printable message from a caught value.
 *
 * Anything can be thrown in JavaScript, so a blind `(err as Error).message`
 * would log `undefined` for non-Error values.
 */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Parses `process.argv` into {@link Args}.
 *
 * Recognised flags are `--task=<id>` and `--server=<url>`; anything else is
 * ignored. The server URL defaults to `BENCH_SERVER_URL`, then to
 * `http://127.0.0.1:9876`. The run id is the current ISO timestamp with `:`
 * and `.` replaced by `-`.
 */
function parseArgs(): Args {
  const argv = process.argv.slice(2);
  let taskFilter: string | undefined;
  let serverUrl = process.env['BENCH_SERVER_URL'] ?? 'http://127.0.0.1:9876';

  for (const a of argv) {
    if (a.startsWith('--task=')) taskFilter = a.slice('--task='.length);
    else if (a.startsWith('--server=')) serverUrl = a.slice('--server='.length);
  }

  const benchRoot = path.resolve(process.cwd(), 'bench');
  const runId = new Date().toISOString().replace(/[:.]/g, '-');
  const resultDir = path.join(benchRoot, 'results', runId);
  return { taskFilter, serverUrl, benchRoot, resultDir };
}

/**
 * Minimal shape check for a parsed task file.
 *
 * Only `id` is verified (object with a non-empty string `id`), because that is
 * the one field this script depends on for filtering and for building the
 * result directory path. The remaining fields are not validated here.
 */
function hasTaskId(value: unknown): value is BenchTask {
  if (typeof value !== 'object' || value === null) return false;
  const id = (value as { id?: unknown }).id;
  return typeof id === 'string' && id.length > 0;
}

/**
 * Loads task definitions from `<benchRoot>/tasks/*.yaml|*.yml`.
 *
 * @param benchRoot Directory containing the `tasks` folder.
 * @param filter    Optional task id; when truthy only the matching task is returned.
 * @returns The parsed tasks, or an empty array when the `tasks` directory is missing.
 *
 * Files that do not parse to an object with a non-empty string `id` are
 * skipped with a warning. YAML syntax errors are not caught here and propagate
 * to the caller.
 */
function loadTasks(benchRoot: string, filter?: string): BenchTask[] {
  const taskDir = path.join(benchRoot, 'tasks');
  if (!fs.existsSync(taskDir)) return [];

  const files = fs.readdirSync(taskDir).filter((f) => f.endsWith('.yaml') || f.endsWith('.yml'));
  const tasks: BenchTask[] = [];
  for (const f of files) {
    const parsed: unknown = parseYaml(fs.readFileSync(path.join(taskDir, f), 'utf-8'));
    if (!hasTaskId(parsed)) {
      // Avoids a TypeError on `parsed.id` and, later, on path.join(resultDir, task.id).
      logger.warn(`[bench] skipping ${f}: not a task definition with a string id`);
      continue;
    }
    if (filter && parsed.id !== filter) continue;
    tasks.push(parsed);
  }
  return tasks;
}

/**
 * Copies the `logs` and `output` subdirectories of a job workspace into `dest`.
 *
 * Does nothing when `rawWorkspace` is empty or does not exist. Individual file
 * copy failures are logged and skipped so that one unreadable file does not
 * abort the copy. Entries that are neither directories nor regular files are
 * ignored.
 *
 * @param rawWorkspace Workspace directory reported by the job.
 * @param dest         Destination directory; created if missing.
 */
function copyWorkspaceArtifacts(rawWorkspace: string, dest: string): void {
  if (!rawWorkspace || !fs.existsSync(rawWorkspace)) return;
  fs.mkdirSync(dest, { recursive: true });

  /** Recursively mirrors `src` into `target`. */
  function copyDir(src: string, target: string): void {
    fs.mkdirSync(target, { recursive: true });
    for (const entry of fs.readdirSync(src, { withFileTypes: true })) {
      const s = path.join(src, entry.name);
      const t = path.join(target, entry.name);
      if (entry.isDirectory()) {
        copyDir(s, t);
      } else if (entry.isFile()) {
        try {
          fs.copyFileSync(s, t);
        } catch (err) {
          logger.warn(`[bench] copy failed ${s} -> ${t}: ${errorMessage(err)}`);
        }
      }
    }
  }

  for (const sub of ['logs', 'output']) {
    const src = path.join(rawWorkspace, sub);
    if (fs.existsSync(src)) copyDir(src, path.join(dest, sub));
  }
}

/**
 * Runs every selected task against the server, grades it on the four axes
 * (tools, checklist, instructions, reasoning), writes a per-task `result.json`
 * plus copied workspace artifacts, then writes the run summary and prints one
 * line per task to stdout.
 *
 * Exits the process with code 1 when no task matches. The fixture server is
 * closed in a `finally` block, so it is shut down even if grading throws.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const config = loadConfig();

  // The judge defaults to the first configured worker, then to the provider-level settings.
  const fallbackWorker = config.provider.workers?.[0];
  const judgeConfig = loadJudgeConfigFromEnv({
    endpoint: fallbackWorker?.endpoint ?? config.provider.baseUrl ?? 'http://127.0.0.1:11434/v1',
    model: fallbackWorker?.model ?? config.provider.model ?? '',
    apiKey: config.provider.apiKey ?? undefined,
  });

  const tasks = loadTasks(args.benchRoot, args.taskFilter);
  if (tasks.length === 0) {
    logger.error(`[bench] no tasks found (filter=${args.taskFilter ?? 'none'})`);
    process.exit(1);
  }

  fs.mkdirSync(args.resultDir, { recursive: true });
  logger.info(`[bench] run dir: ${args.resultDir}`);
  logger.info(`[bench] server: ${args.serverUrl}`);
  logger.info(`[bench] judge: ${judgeConfig ? `${judgeConfig.endpoint} model=${judgeConfig.model}` : 'off'}`);

  const fixtureServer = await startFixtureServer(path.join(args.benchRoot, 'fixtures', 'web'));
  const results: BenchResult[] = [];
  try {
    for (const task of tasks) {
      const startedAt = new Date().toISOString();
      logger.info(`[bench] === task: ${task.id} ===`);

      let raw: RawJobResult;
      try {
        raw = await runOneTask(task, {
          serverUrl: args.serverUrl,
          benchRoot: args.benchRoot,
          webPort: fixtureServer.port,
        });
      } catch (err) {
        // A failed run is logged and left out of the results; remaining tasks still run.
        // NOTE(review): the process still exits 0 in this case — confirm that is intended for CI.
        logger.error(`[bench] runner failed for ${task.id}: ${errorMessage(err)}`);
        continue;
      }

      const tools = gradeTools(task, raw);
      const checklist = gradeChecklist(task, raw);
      const instructions = gradeInstructionsProgrammatic(task, raw);
      const reasoning = await gradeReasoning(task, raw, judgeConfig);
      const finishedAt = new Date().toISOString();

      const result: BenchResult = {
        taskId: task.id,
        taskTitle: task.title,
        startedAt,
        finishedAt,
        raw,
        axes: { tools, checklist, instructions, reasoning },
        total: 0, // placeholder; computed from the axes just below
      };
      result.total = computeTotal(result.axes);

      const taskResultDir = path.join(args.resultDir, task.id);
      fs.mkdirSync(taskResultDir, { recursive: true });
      copyWorkspaceArtifacts(raw.workspacePath, path.join(taskResultDir, 'workspace'));
      fs.writeFileSync(path.join(taskResultDir, 'result.json'), JSON.stringify(result, null, 2), 'utf-8');
      results.push(result);
    }
  } finally {
    await fixtureServer.close();
  }

  const summaryPath = writeRunSummary(args.resultDir, results);
  logger.info(`[bench] wrote ${summaryPath}`);

  // Concise stdout summary so CI / humans see totals without opening the file.
  const pct = (score: number): string => (score * 100).toFixed(0);
  for (const r of results) {
    // eslint-disable-next-line no-console
    console.log(`${r.taskId}: total=${r.total}/100 status=${r.raw.status} tools=${pct(r.axes.tools.score)}% checklist=${pct(r.axes.checklist.score)}% instr=${pct(r.axes.instructions.score)}% reasoning=${pct(r.axes.reasoning.score)}%`);
  }
}

main().catch((err) => {
  // eslint-disable-next-line no-console
  console.error(err);
  process.exit(1);
});