maestro/src/engine/piece-runner.test.ts
2026-06-04 03:03:12 +00:00

1161 lines
45 KiB
TypeScript

import { execFileSync } from 'child_process';
import { tmpdir } from 'os';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'fs';
import { join } from 'path';
import type { MovementResult } from './agent-loop.js';
import type { PieceDef } from './piece-runner.js';
// Replace the agent-loop module with a stub whose `executeMovement` is a bare
// vi.fn(), so every test can script the MovementResult returned per movement.
vi.mock('./agent-loop.js', () => ({
executeMovement: vi.fn(),
}));
import { executeMovement } from './agent-loop.js';
import { loadPiece, runPiece, normalizeRequiredMcp, validatePieceDef, validateAllowedSshConnections } from './piece-runner.js';
// Typed handle on the mocked `executeMovement`, used to program and reset it.
const executeMovementMock = vi.mocked(executeMovement);
/**
 * Builds the multi-movement piece definition shared by the runPiece tests.
 *
 * Every movement has empty `allowed_tools` / `rules` and an `instruction`
 * equal to its `name`. A brand-new object graph is returned on each call, so
 * callers are free to mutate the result.
 *
 * NOTE(review): the movement list contains two entries named 'verify' (with
 * different `default_next` values). That is reproduced as-is here; confirm
 * whether the duplicate name is intentional.
 */
function makePiece(): PieceDef {
  // Factory for one movement; only the four varying fields are parameters.
  const movement = (name: string, edit: boolean, persona: string, default_next: string) => ({
    name,
    edit,
    persona,
    instruction: name,
    allowed_tools: [],
    rules: [],
    default_next,
  });

  return {
    name: 'test-piece',
    description: 'test',
    max_movements: 10,
    initial_movement: 'execute',
    movements: [
      movement('execute', true, 'worker', 'verify'),
      movement('verify', false, 'reviewer', 'execute'),
      movement('analyze', true, 'analyst', 'COMPLETE'),
      movement('plan', false, 'planner', 'analyze'),
      movement('verify', false, 'reviewer', 'plan'),
    ],
  };
}
/**
 * Creates a uniquely named temporary directory under the OS temp dir
 * (prefix `piece-runner-test-`) and returns its absolute path.
 * The caller is responsible for removing it.
 */
function makeWorkspace(): string {
  const prefix = join(tmpdir(), 'piece-runner-test-');
  const created = mkdtempSync(prefix);
  return created;
}
/**
 * Creates a temp workspace (via makeWorkspace) and turns it into a git
 * repository on branch `main` with a local test identity and one initial
 * commit containing a two-line README.md. Returns the workspace path.
 */
function makeGitWorkspace(): string {
  const root = makeWorkspace();
  // Runs one git sub-command with the new workspace as working directory.
  const git = (...args: string[]): void => {
    execFileSync('git', args, { cwd: root });
  };

  git('init', '--initial-branch=main');
  git('config', 'user.name', 'Test User');
  git('config', 'user.email', 'test@example.com');

  writeFileSync(join(root, 'README.md'), 'line1\nline2\n', 'utf-8');
  git('add', 'README.md');
  git('commit', '-m', 'init');

  return root;
}
// Suite: how runPiece threads review feedback, git change context, loop
// detection and ASK limits through successive movements. executeMovement is
// mocked, so each test scripts the MovementResult sequence that drives the run.
describe('piece-runner review feedback flow', () => {
// Temp workspace of the current test; '' means there is nothing to clean up.
let workspacePath = '';
beforeEach(() => {
executeMovementMock.mockReset();
});
afterEach(() => {
if (workspacePath) {
rmSync(workspacePath, { recursive: true, force: true });
workspacePath = '';
}
});
// Scripted path: execute -> verify -> execute -> verify -> analyze -> COMPLETE.
// The instruction passed to each call is recorded so the test can check which
// review outputs were carried forward.
it('carries cumulative verify feedback into later execute/analyze movements', async () => {
workspacePath = makeWorkspace();
const instructions: string[] = [];
// Consumed front-to-back, one entry per executeMovement call.
const results: MovementResult[] = [
{ next: 'verify', output: 'first draft', toolsUsed: [] },
{ next: 'execute', output: 'review 1: fix title', toolsUsed: [] },
{ next: 'verify', output: 'second draft', toolsUsed: [] },
{ next: 'analyze', output: 'review 2: add conclusion', toolsUsed: [] },
{ next: 'COMPLETE', output: 'done', toolsUsed: [] },
];
executeMovementMock.mockImplementation(async (_movement, instruction) => {
instructions.push(instruction);
const next = results.shift();
// Fail loudly if runPiece asks for more movements than were scripted.
if (!next) throw new Error('no mock result left');
return next;
});
const result = await runPiece(makePiece(), 'TASK', {} as never, workspacePath);
expect(result.status).toBe('completed');
// The very first movement receives the task text unchanged.
expect(instructions[0]).toBe('TASK');
// Third call (index 2): only the first review output is expected.
expect(instructions[2]).toContain('これまでのレビュー指摘');
expect(instructions[2]).toContain('review 1: fix title');
expect(instructions[2]).not.toContain('review 2: add conclusion');
// Fifth call (index 4): both review outputs are expected.
expect(instructions[4]).toContain('review 1: fix title');
expect(instructions[4]).toContain('review 2: add conclusion');
});
// Uses a real git repository so the instruction of the third call can be
// checked for status/diff sections.
it('appends safe git status and diff context after verify loops', async () => {
workspacePath = makeGitWorkspace();
const instructions: string[] = [];
// Number of executeMovement calls answered so far.
let callIndex = 0;
executeMovementMock.mockImplementation(async (_movement, instruction) => {
instructions.push(instruction);
if (callIndex === 0) {
// First call: modify the tracked README and add new files under
// output/, input/ and logs/.
mkdirSync(join(workspacePath, 'output'), { recursive: true });
mkdirSync(join(workspacePath, 'input'), { recursive: true });
mkdirSync(join(workspacePath, 'logs'), { recursive: true });
writeFileSync(join(workspacePath, 'README.md'), 'line1\nline2 changed\n', 'utf-8');
writeFileSync(join(workspacePath, 'output', 'report.md'), '# report\n', 'utf-8');
writeFileSync(join(workspacePath, 'input', 'noise.txt'), 'ignore me\n', 'utf-8');
writeFileSync(join(workspacePath, 'logs', 'runtime.log'), 'ignore me too\n', 'utf-8');
callIndex++;
return { next: 'verify', output: 'draft ready', toolsUsed: [] };
}
if (callIndex === 1) {
// Second call: structured review output that sends the run back to execute.
callIndex++;
return { next: 'execute', output: '[判定] needs_fix\n## 問題点\n- README.md: wording\n## 期待する修正\n- tighten wording', toolsUsed: [] };
}
// Any later call finishes the run.
callIndex++;
return { next: 'COMPLETE', output: 'done', toolsUsed: [] };
});
const result = await runPiece(makePiece(), 'TASK', {} as never, workspacePath);
expect(result.status).toBe('completed');
expect(instructions[2]).toContain('これまでのレビュー指摘');
expect(instructions[2]).toContain('## 現在の変更状況');
expect(instructions[2]).toContain('## 変更差分(抜粋)');
expect(instructions[2]).toContain('README.md');
expect(instructions[2]).toContain('output/report.md');
// Files under input/ and logs/ must not show up in the appended context.
expect(instructions[2]).not.toContain('input/noise.txt');
expect(instructions[2]).not.toContain('logs/runtime.log');
});
it('aborts when loop detection fires due to consecutive revisits', async () => {
workspacePath = makeWorkspace();
// Movement always transitions back to itself: execute→execute→execute...
// This triggers the consecutive visit counter
executeMovementMock.mockResolvedValue({ next: 'execute', output: 'still going', toolsUsed: [] });
const piece = makePiece();
// Set a low max_consecutive_revisits so the test triggers quickly
for (const m of piece.movements) {
(m as any).max_consecutive_revisits = 2;
}
const result = await runPiece(piece, 'TASK', {} as never, workspacePath);
expect(result.status).toBe('aborted');
expect(result.abortReason).toBe('loop_detected');
expect(result.finalOutput).toContain('Loop detected');
});
it('falls back to default_next when ASK limit is reached', async () => {
workspacePath = makeWorkspace();
// Scripted queue: ASK, ASK, then COMPLETE (intended for the fallback movement).
// NOTE(review): with askCount=0 / maxAskPerJob=1 the assertions below expect the
// run to stop at the first ASK, so only the first entry is observed here and the
// fallback path named in the title does not appear to be exercised — confirm intent.
const results: MovementResult[] = [
{ next: 'ASK', output: 'Need info 1', toolsUsed: [] },
{ next: 'ASK', output: 'Need info 2', toolsUsed: [] },
// After ASK limit, piece-runner should fall back to default_next ("verify")
// verify returns COMPLETE
{ next: 'COMPLETE', output: 'done from fallback', toolsUsed: [] },
];
executeMovementMock.mockImplementation(async () => {
const next = results.shift();
if (!next) throw new Error('no mock result left');
return next;
});
const result = await runPiece(
makePiece(),
'TASK',
{} as never,
workspacePath,
undefined,
undefined,
{ askCount: 0, maxAskPerJob: 1 },
);
// First ASK (askCount=1, maxAsk=1): returned to caller as waiting_human
expect(result.status).toBe('waiting_human');
expect(result.finalOutput).toBe('Need info 1');
});
it('aborts when ASK limit reached and no fallback transition exists', async () => {
workspacePath = makeWorkspace();
// Piece with a single movement that has no default_next and no rules with forward transitions
const singlePiece: PieceDef = {
name: 'test-single',
description: 'test',
max_movements: 10,
initial_movement: 'execute',
movements: [
{
name: 'execute',
edit: true,
persona: 'worker',
instruction: 'do work',
allowed_tools: [],
rules: [],
// No default_next
},
],
};
executeMovementMock.mockResolvedValue({ next: 'ASK', output: 'Need info', toolsUsed: [] });
const result = await runPiece(
singlePiece,
'TASK',
{} as never,
workspacePath,
undefined,
undefined,
{ askCount: 1, maxAskPerJob: 1 }, // Already at limit
);
// askCount starts at 1, +1 = 2 which > maxAsk=1, so it should try to find fallback
// No fallback exists, so it aborts
expect(result.status).toBe('aborted');
expect(result.abortReason).toBe('ask_limit_reached');
});
// Loads the bundled piece YAML files from `<cwd>/pieces` and checks that the
// review-related instructions still contain the expected section headings.
it('keeps piece YAML review prompts structured and plan-aware', () => {
const piecesDir = join(process.cwd(), 'pieces');
const general = loadPiece('general', piecesDir);
const office = loadPiece('office-process', piecesDir);
const research = loadPiece('research', piecesDir);
expect(general.movements.find((m) => m.name === 'verify')?.instruction).toContain('## 問題点');
expect(general.movements.find((m) => m.name === 'verify')?.instruction).toContain('## 合格基準');
expect(office.movements.find((m) => m.name === 'verify')?.instruction).toContain('## 期待する修正');
expect(office.movements.find((m) => m.name === 'process')?.instruction).toContain('合格基準');
expect(research.movements.find((m) => m.name === 'verify')?.instruction).toContain('## 問題点');
expect(research.movements.find((m) => m.name === 'analyze')?.instruction).toContain('合格基準');
// After Phase 6a-2: verify has a single rule (analyze fallback) since
// COMPLETE / ABORT / ASK terminals moved to the `complete` tool.
expect(research.movements.find((m) => m.name === 'verify')?.rules[0]?.next).toBe('analyze');
});
});
// Suite: loadPiece validation of `rules[].next` values, plus smoke checks on
// the bundled piece files under `<cwd>/pieces`.
// NOTE(review): the YAML bodies in the template literals below appear flattened
// (no nesting indentation) in this view; they are whitespace-sensitive, so
// confirm them against the real file before editing.
describe('loadPiece terminal-rule validation (Phase 6b)', () => {
// Per-test temp directory that receives the custom piece YAML files.
let tempDir: string;
beforeEach(() => {
tempDir = mkdtempSync(join(tmpdir(), 'phase6b-loadpiece-'));
});
afterEach(() => {
rmSync(tempDir, { recursive: true, force: true });
});
// Writes `<tempDir>/<name>.yaml` with the given body.
function writePiece(name: string, body: string): void {
writeFileSync(join(tempDir, `${name}.yaml`), body, 'utf-8');
}
// COMPLETE inside rules[] must be rejected even though default_next: COMPLETE
// is present in the same movement.
it('rejects custom piece with rules[].next: COMPLETE', () => {
writePiece('bad', `name: bad
description: terminal in rules
max_movements: 1
initial_movement: only
movements:
- name: only
edit: false
persona: p
instruction: i
allowed_tools: [Read]
default_next: COMPLETE
rules:
- condition: done
next: COMPLETE
`);
expect(() => loadPiece('bad', 'pieces', tempDir)).toThrow(/reserved terminal next values/);
});
// Two separate files: one with ABORT, one with ASK; each error message must
// name the offending value.
it('rejects rules[].next: ABORT and ASK with the same error', () => {
writePiece('bad-abort', `name: bad-abort
description: x
max_movements: 1
initial_movement: only
movements:
- name: only
edit: false
persona: p
instruction: i
allowed_tools: [Read]
rules:
- condition: fail
next: ABORT
`);
expect(() => loadPiece('bad-abort', 'pieces', tempDir)).toThrow(/rule\.next="ABORT"/);
writePiece('bad-ask', `name: bad-ask
description: x
max_movements: 1
initial_movement: only
movements:
- name: only
edit: false
persona: p
instruction: i
allowed_tools: [Read]
rules:
- condition: ask
next: ASK
`);
expect(() => loadPiece('bad-ask', 'pieces', tempDir)).toThrow(/rule\.next="ASK"/);
});
// COMPLETE remains legal as default_next; the loaded value is read back.
it('accepts default_next: COMPLETE (engine-internal sentinel)', () => {
writePiece('good', `name: good
description: terminal only via default_next
max_movements: 1
initial_movement: only
movements:
- name: only
edit: false
persona: p
instruction: i
allowed_tools: [Read]
default_next: COMPLETE
rules: []
`);
const piece = loadPiece('good', 'pieces', tempDir);
expect(piece.movements[0]?.default_next).toBe('COMPLETE');
});
// Rules may point at another movement or at WAIT_SUBTASKS without error.
it('accepts movement-to-movement rules + WAIT_SUBTASKS sentinel', () => {
writePiece('multi', `name: multi
description: x
max_movements: 1
initial_movement: a
movements:
- name: a
edit: false
persona: p
instruction: i
allowed_tools: [Read]
default_next: b
rules:
- condition: spawn done
next: WAIT_SUBTASKS
- condition: do next
next: b
- name: b
edit: false
persona: p
instruction: i
allowed_tools: [Read]
default_next: COMPLETE
rules: []
`);
expect(() => loadPiece('multi', 'pieces', tempDir)).not.toThrow();
});
// Smoke test over the twelve piece names listed below; keep the list and the
// count in the title in sync when pieces are added or removed.
it('all 12 bundled pieces load without validation errors', () => {
const piecesDir = join(process.cwd(), 'pieces');
const names = ['brainstorming', 'chat', 'data-process', 'general',
'office-process', 'piece-builder', 'research', 'slide', 'sns-research',
'ssh-console', 'ssh-ops', 'x-ai-digest'];
for (const name of names) {
expect(() => loadPiece(name, piecesDir)).not.toThrow();
}
});
// ssh-console: exactly one movement, `interact`, with the SshConsole* tools.
it('ssh-console piece declares SshConsole* tools and wildcard allowed_ssh_connections', () => {
const piece = loadPiece('ssh-console', join(process.cwd(), 'pieces'));
expect(piece.name).toBe('ssh-console');
expect(piece.movements).toHaveLength(1);
const interact = piece.movements[0]!;
expect(interact.name).toBe('interact');
// arrayContaining: additional tools beyond these three are allowed.
expect(interact.allowed_tools).toEqual(expect.arrayContaining([
'SshConsoleEnsure', 'SshConsoleSend', 'SshConsoleSnapshot',
]));
expect(interact.allowed_ssh_connections).toEqual(['*']);
expect(interact.default_next).toBe('COMPLETE');
});
// ssh-ops: `execute` carries the SSH tools; `verify` declares no connections.
it('ssh-ops piece declares SSH tools and wildcard allowed_ssh_connections', () => {
const piece = loadPiece('ssh-ops', join(process.cwd(), 'pieces'));
const execute = piece.movements.find((m) => m.name === 'execute');
expect(execute).toBeDefined();
expect(execute!.allowed_tools).toEqual(expect.arrayContaining(['SshExec', 'SshUpload', 'SshDownload']));
expect(execute!.allowed_ssh_connections).toEqual(['*']);
const verify = piece.movements.find((m) => m.name === 'verify');
expect(verify).toBeDefined();
// verify has no SSH tools, so allowed_ssh_connections is optional and omitted.
expect(verify!.allowed_ssh_connections).toBeUndefined();
});
});
/**
 * Suite: runPiece must still execute movements when `piece.max_movements` is
 * missing or not a positive number, instead of aborting before the first
 * movement runs.
 */
describe('runPiece max_movements defensive default', () => {
  // Temp workspace of the current test case; '' means nothing to clean up.
  let workspace = '';

  beforeEach(() => {
    executeMovementMock.mockReset();
    workspace = mkdtempSync(join(tmpdir(), 'mm-default-'));
  });

  afterEach(() => {
    if (workspace) {
      rmSync(workspace, { recursive: true, force: true });
      workspace = '';
    }
  });

  // Regression: a piece YAML missing max_movements (e.g. an LLM-corrupted
  // override) used to make `while (steps < undefined)` false on the first
  // iteration, aborting instantly with "Exceeded max movements (undefined)"
  // before any movement ran.
  it('still iterates when piece.max_movements is missing (falls back to default)', async () => {
    executeMovementMock.mockResolvedValue({ next: 'COMPLETE', output: 'ok', toolsUsed: [] });
    const piece = makePiece();
    // Simulate the field being absent at runtime despite the static type.
    delete (piece as Partial<PieceDef>).max_movements;

    const result = await runPiece(piece, 'TASK', {} as never, workspace);

    expect(result.status).toBe('completed');
    expect(executeMovementMock).toHaveBeenCalled();
  });

  // The title promises both zero and negative values, so both are exercised.
  // it.each gives every value its own beforeEach/afterEach cycle, i.e. a fresh
  // workspace and a freshly reset mock.
  it.each([0, -1])(
    'still iterates when piece.max_movements is 0 or negative (max_movements=%i)',
    async (invalidMax) => {
      executeMovementMock.mockResolvedValue({ next: 'COMPLETE', output: 'ok', toolsUsed: [] });
      const piece = makePiece();
      piece.max_movements = invalidMax;

      const result = await runPiece(piece, 'TASK', {} as never, workspace);

      expect(result.status).toBe('completed');
      expect(executeMovementMock).toHaveBeenCalled();
    },
  );
});
import { buildFollowupNotice } from './piece-runner.js';
/**
 * Suite: buildFollowupNotice returns '' for a workspace without prior work and
 * a notice containing the follow-up heading once output/ or subtasks/ hold
 * real files.
 */
describe('buildFollowupNotice (option C)', () => {
  // Heading expected in every non-empty notice.
  const FOLLOWUP_HEADING = '【継続タスク】';
  let workspace: string;

  // Creates `<workspace>/<...parts>` (recursively) and returns the directory path.
  const ensureDir = (...parts: string[]): string => {
    const dir = join(workspace, ...parts);
    mkdirSync(dir, { recursive: true });
    return dir;
  };

  beforeEach(() => {
    workspace = mkdtempSync(join(tmpdir(), 'followup-test-'));
  });

  afterEach(() => {
    rmSync(workspace, { recursive: true, force: true });
  });

  it('returns empty string for a fresh workspace (no follow-up signal)', () => {
    const notice = buildFollowupNotice(workspace);
    expect(notice).toBe('');
  });

  it('returns empty when output/ exists but is empty', () => {
    ensureDir('output');
    const notice = buildFollowupNotice(workspace);
    expect(notice).toBe('');
  });

  it('detects follow-up when output/ has any non-hidden file', () => {
    const outputDir = ensureDir('output');
    writeFileSync(join(outputDir, 'report.md'), 'prior work', 'utf-8');

    const notice = buildFollowupNotice(workspace);

    for (const fragment of [FOLLOWUP_HEADING, 'CreateChecklist', '2 回目以降']) {
      expect(notice).toContain(fragment);
    }
  });

  it('detects follow-up when subtasks/ has content (multi-stage flows)', () => {
    const subtaskDir = ensureDir('subtasks', '1');
    writeFileSync(join(subtaskDir, 'placeholder.txt'), 'x', 'utf-8');

    const notice = buildFollowupNotice(workspace);

    expect(notice).toContain(FOLLOWUP_HEADING);
  });

  it('ignores hidden / engine-internal files', () => {
    const outputDir = ensureDir('output');
    // Phase 5 engine-internal artifacts must NOT count as follow-up signal,
    // otherwise the very first run would incorrectly self-flag.
    const ignoredFiles: Array<[string, string]> = [
      ['memory-delta.json', '{}'],
      ['.gitkeep', ''],
    ];
    for (const [fileName, content] of ignoredFiles) {
      writeFileSync(join(outputDir, fileName), content, 'utf-8');
    }

    expect(buildFollowupNotice(workspace)).toBe('');
  });
});
// ============================================================
// Traceability T-2 — handoff / delta / followup / context_action
// ============================================================
import { runPiece } from './piece-runner.js';
import { readFileSync } from 'fs';
import { createFileEventLogger, parseEventLine, type EventBase } from '../progress/event-log.js';
import type { OpenAICompatClient, LLMEvent } from '../llm/openai-compat.js';
// NOTE(review): this repeats the './agent-loop.js' mock registration made near
// the top of this file with an identical factory; confirm whether one of the
// two can be dropped.
vi.mock('./agent-loop.js', () => ({
executeMovement: vi.fn(),
}));
/**
 * Reads `<workspacePath>/logs/events.jsonl` and returns its parsed events in
 * file order.
 *
 * @param workspacePath Root directory of the workspace to inspect.
 * @returns An empty array when the log file does not exist; otherwise one
 *   event per non-empty line.
 * @throws Error (`bad event: <line>`) when parseEventLine does not report
 *   kind 'ok' for a line.
 */
function readAllEvents(workspacePath: string): EventBase[] {
  const logFile = join(workspacePath, 'logs', 'events.jsonl');
  if (!existsSyncEvents(logFile)) {
    return [];
  }

  const events: EventBase[] = [];
  for (const line of readFileSync(logFile, 'utf-8').trim().split('\n')) {
    // Skip empty lines (also covers a completely empty file).
    if (!line) continue;
    const parsed = parseEventLine(line);
    if (parsed.kind !== 'ok') {
      throw new Error(`bad event: ${line}`);
    }
    events.push(parsed.event);
  }
  return events;
}
import { existsSync as existsSyncEvents } from 'fs';
/**
 * Suite: events that runPiece writes to logs/events.jsonl around subtask
 * boundaries (handoff read, delta absorb), follow-up detection, and the
 * run_start / run_complete bookends. Every test runs the same one-movement
 * piece with executeMovement scripted to return COMPLETE.
 */
describe('Traceability T-2: piece-runner emission for subtask boundary + followup', () => {
  let workspace: string;

  // Minimal piece: a single read-only movement `m` that defaults to COMPLETE.
  const singleMovementPiece = (): PieceDef => ({
    name: 'tester',
    description: 'd',
    max_movements: 1,
    initial_movement: 'm',
    movements: [
      { name: 'm', edit: false, persona: 'p', instruction: 'i', allowed_tools: [], rules: [], default_next: 'COMPLETE' },
    ],
  });

  // Scripts the mocked executeMovement so the only movement finishes the run.
  const scriptImmediateCompletion = (): void => {
    vi.mocked(executeMovement).mockResolvedValue({
      next: 'COMPLETE',
      output: 'done',
      toolsUsed: [],
    });
  };

  // Runs the one-movement piece in the current workspace and returns the
  // events found in its event log afterwards.
  const runAndReadEvents = async () => {
    await runPiece(singleMovementPiece(), 'task', {} as OpenAICompatClient, workspace);
    return readAllEvents(workspace);
  };

  // Writes subtasks/1/output/memory-delta.json containing exactly one fact.
  const seedChildDelta = (deltaId: string, childJobId: string, fact: Record<string, unknown>): void => {
    const childOutputDir = join(workspace, 'subtasks', '1', 'output');
    mkdirSync(childOutputDir, { recursive: true });
    const delta = {
      version: 1,
      deltaId,
      childJobId,
      childWorkspaceRelative: 'subtasks/1',
      childStatus: 'success',
      partial: false,
      createdAt: '2026-05-02T00:00:00.000Z',
      facts: [fact],
      decisions: [],
      openQuestions: [],
      doNotRepeat: [],
    };
    writeFileSync(join(childOutputDir, 'memory-delta.json'), JSON.stringify(delta), 'utf-8');
  };

  beforeEach(() => {
    workspace = mkdtempSync(join(tmpdir(), 'trace-t2-'));
  });

  afterEach(() => {
    rmSync(workspace, { recursive: true, force: true });
    vi.mocked(executeMovement).mockReset();
  });

  it('emits memory_handoff_read when a parent handoff exists at startup', async () => {
    // Simulate a parent handoff already in the workspace.
    const inputDir = join(workspace, 'input');
    mkdirSync(inputDir, { recursive: true });
    const handoff = {
      version: 1,
      handoffId: 'h-1',
      parentJobId: 'parent-job-1',
      parentWorkspaceRelative: '../..',
      createdAt: '2026-05-02T00:00:00.000Z',
      facts: [
        {
          claim: 'parent X',
          confidence: 'high',
          evidencePaths: [],
          evidenceUrls: [],
          observedAt: '2026-05-02T00:00:00.000Z',
          portability: 'portable',
          evidenceKind: 'none',
          lineage: [],
        },
      ],
      decisions: [],
      openQuestions: [],
      doNotRepeat: [],
    };
    writeFileSync(join(inputDir, 'memory-handoff.json'), JSON.stringify(handoff), 'utf-8');
    scriptImmediateCompletion();

    const events = await runAndReadEvents();

    const handoffRead = events.find((event) => event.kind === 'memory_handoff_read');
    expect(handoffRead).toBeDefined();
    const payload = handoffRead?.payload as { parentJobId: string };
    expect(payload.parentJobId).toBe('parent-job-1');
  });

  it('emits followup_detected when output/ has prior content', async () => {
    const outputDir = join(workspace, 'output');
    mkdirSync(outputDir, { recursive: true });
    writeFileSync(join(outputDir, 'prior.md'), 'previous turn output', 'utf-8');
    scriptImmediateCompletion();

    const events = await runAndReadEvents();

    const sawFollowup = events.some((event) => event.kind === 'followup_detected');
    expect(sawFollowup).toBe(true);
  });

  it('emits memory_delta_absorb (skipped_already_absorbed) when re-resuming', async () => {
    // Pre-seed a child delta + an absorbed-deltas log saying it's already done.
    seedChildDelta('d-1', 'child-1', {
      claim: 'child finding',
      confidence: 'high',
      evidencePaths: [],
      evidenceUrls: [],
      observedAt: '2026-05-02T00:00:00.000Z',
      portability: 'portable',
      evidenceKind: 'none',
      lineage: [],
    });
    const logsDir = join(workspace, 'logs');
    mkdirSync(logsDir, { recursive: true });
    writeFileSync(
      join(logsDir, 'absorbed-deltas.json'),
      JSON.stringify({ version: 1, ids: ['d-1'] }),
      'utf-8',
    );
    scriptImmediateCompletion();

    const events = await runAndReadEvents();

    const absorb = events.find((event) => event.kind === 'memory_delta_absorb');
    expect(absorb).toBeDefined();
    const payload = absorb?.payload as { outcome: string };
    expect(payload.outcome).toBe('skipped_already_absorbed');
  });

  it('emits memory_delta_absorb (merged) and counts when a fresh delta is found', async () => {
    seedChildDelta('d-2', 'child-2', {
      claim: 'child A',
      confidence: 'high',
      evidencePaths: ['output/a.ts'],
      evidenceUrls: [],
      observedAt: '2026-05-02T00:00:00.000Z',
      portability: 'workspace_local',
      evidenceKind: 'local_path',
      lineage: [],
    });
    scriptImmediateCompletion();

    const events = await runAndReadEvents();

    const absorb = events.find(
      (event) =>
        event.kind === 'memory_delta_absorb' &&
        (event.payload as { outcome: string }).outcome === 'merged',
    );
    expect(absorb).toBeDefined();
    const payload = absorb?.payload as { counts: { factsAdded: number } };
    expect(payload.counts.factsAdded).toBe(1);
  });

  it('emits run_start and run_complete bookending each piece run', async () => {
    scriptImmediateCompletion();

    const events = await runAndReadEvents();

    expect(events[0]!.kind).toBe('run_start');
    const lastIndex = events.length - 1;
    expect(events[lastIndex]!.kind).toBe('run_complete');
    const completePayload = events[lastIndex]!.payload as { status: string };
    expect(completePayload.status).toBe('completed');
  });
});
import { existsSync as existsSyncSnapshot, readdirSync as readdirSnapshot } from 'fs';
describe('Cancel-traceability PR1: memory snapshot on terminal non-success', () => {
let workspace: string;
beforeEach(() => {
workspace = mkdtempSync(join(tmpdir(), 'cancel-snap-'));
});
afterEach(() => {
rmSync(workspace, { recursive: true, force: true });
vi.mocked(executeMovement).mockReset();
});
function findSnapshotFiles(): string[] {
const dir = join(workspace, 'logs');
if (!existsSyncSnapshot(dir)) return [];
return readdirSnapshot(dir).filter((f) => f.startsWith('memory-snapshot-') && f.endsWith('.json'));
}
it('writes snapshot + meta-event when cancelled before any movement', async () => {
const piece: PieceDef = {
name: 'tester', description: 'd', max_movements: 3, initial_movement: 'm',
movements: [{ name: 'm', edit: false, persona: 'p', instruction: 'i', allowed_tools: [], rules: [], default_next: 'COMPLETE' }],
};
// cancel BEFORE movement runs: cancelCheck returns true on first guard.
const result = await runPiece(piece, 'task', {} as OpenAICompatClient, workspace, undefined, undefined, { cancelCheck: () => true });
expect(result.status).toBe('cancelled');
expect(result.memorySnapshotPath).toBeDefined();
expect(result.memorySnapshotPath).toMatch(/^logs\/memory-snapshot-cancelled-/);
const files = findSnapshotFiles();
expect(files.length).toBe(1);
expect(files[0]).toMatch(/^memory-snapshot-cancelled-.*\.json$/);
const fileContent = JSON.parse(readFileSync(join(workspace, 'logs', files[0]!), 'utf-8'));
expect(fileContent.schemaVersion).toBe(2);
expect(fileContent.status).toBe('cancelled');
expect(fileContent.memory).toBeDefined();
expect(fileContent.memory.facts).toEqual([]);
expect(fileContent.runId).toBeDefined();
// v2 forensics fields
expect(fileContent.finalOutput).toBeDefined();
expect(Array.isArray(fileContent.movementHistory)).toBe(true);
expect(Array.isArray(fileContent.lessons)).toBe(true);
expect(Array.isArray(fileContent.contextActions)).toBe(true);
expect(fileContent.stats).toBeDefined();
expect(typeof fileContent.stats.totalSteps).toBe('number');
expect(fileContent.eventsLogRelative).toBe('logs/events.jsonl');
const events = readAllEvents(workspace);
const written = events.find((e) => e.kind === 'memory_snapshot_written');
expect(written).toBeDefined();
expect((written!.payload as { status: string }).status).toBe('cancelled');
expect((written!.payload as { path: string }).path).toMatch(/^logs\/memory-snapshot-cancelled-/);
const runComplete = events[events.length - 1]!;
expect(runComplete.kind).toBe('run_complete');
const payload = runComplete.payload as { status: string; cancel?: { phase: string; snapshotPath: string }; memorySnapshotPath?: string };
expect(payload.status).toBe('cancelled');
expect(payload.memorySnapshotPath).toBeDefined();
expect(payload.cancel?.phase).toBe('before_movement');
expect(payload.cancel?.snapshotPath).toBe(payload.memorySnapshotPath);
});
it('writes snapshot when cancelled mid-movement (ABORT with cancelled output)', async () => {
vi.mocked(executeMovement).mockResolvedValue({
next: 'ABORT', output: 'Job was cancelled by user request', toolsUsed: [],
});
const piece: PieceDef = {
name: 'tester', description: 'd', max_movements: 3, initial_movement: 'm',
movements: [{ name: 'm', edit: false, persona: 'p', instruction: 'i', allowed_tools: [], rules: [], default_next: 'COMPLETE' }],
};
const result = await runPiece(piece, 'task', {} as OpenAICompatClient, workspace);
expect(result.status).toBe('cancelled');
expect(result.memorySnapshotPath).toBeDefined();
const events = readAllEvents(workspace);
const runComplete = events[events.length - 1]!;
expect(runComplete.kind).toBe('run_complete');
const payload = runComplete.payload as { cancel?: { phase: string; movement: string } };
expect(payload.cancel?.phase).toBe('mid_movement');
expect(payload.cancel?.movement).toBe('m');
});
it('writes snapshot on aborted (max_movements exceeded)', async () => {
// Always return next='m' to bounce back, hitting max_movements.
vi.mocked(executeMovement).mockResolvedValue({
next: 'm', output: 'still working', toolsUsed: [],
});
const piece: PieceDef = {
name: 'tester', description: 'd', max_movements: 1, initial_movement: 'm',
movements: [{
name: 'm', edit: false, persona: 'p', instruction: 'i',
allowed_tools: [],
rules: [{ condition: 'always', next: 'm' }],
default_next: 'COMPLETE',
max_consecutive_revisits: 100,
}],
};
const result = await runPiece(piece, 'task', {} as OpenAICompatClient, workspace);
expect(result.status).toBe('aborted');
expect(result.memorySnapshotPath).toBeDefined();
expect(result.memorySnapshotPath).toMatch(/^logs\/memory-snapshot-aborted-/);
const events = readAllEvents(workspace);
const written = events.find((e) => e.kind === 'memory_snapshot_written');
expect(written).toBeDefined();
expect((written!.payload as { status: string }).status).toBe('aborted');
});
it('v2 snapshot captures finalOutput / movementHistory / lessons on agent-self-abort', async () => {
// Simulate `complete({status:'aborted', abort_reason:'...'})` — the
// forensics gap that motivated schemaVersion=2. The LLM's abort_reason
// surfaces as MovementResult.output and PieceRunResult.finalOutput; v1
// dropped it entirely from the snapshot.
const abortReasonText = 'Cannot proceed: required input file is missing and user is unavailable';
vi.mocked(executeMovement).mockResolvedValue({
next: 'ABORT',
output: abortReasonText,
toolsUsed: ['Read', 'Glob'],
lessons: 'Lesson: validate input presence before plan phase',
});
const piece: PieceDef = {
name: 'tester', description: 'd', max_movements: 3, initial_movement: 'respond',
movements: [{
name: 'respond', edit: false, persona: 'p', instruction: 'i',
allowed_tools: [], rules: [], default_next: 'COMPLETE',
}],
};
const result = await runPiece(piece, 'task', {} as OpenAICompatClient, workspace);
expect(result.status).toBe('aborted');
expect(result.abortReason).toBe('movement_abort');
expect(result.memorySnapshotPath).toBeDefined();
const files = findSnapshotFiles();
expect(files.length).toBe(1);
const fileContent = JSON.parse(readFileSync(join(workspace, 'logs', files[0]!), 'utf-8'));
expect(fileContent.schemaVersion).toBe(2);
expect(fileContent.status).toBe('aborted');
expect(fileContent.abortReason).toBe('movement_abort');
expect(fileContent.currentMovement).toBe('respond');
// The LLM's abort_reason text — the most important forensic field — is
// now preserved verbatim at top level.
expect(fileContent.finalOutput).toBe(abortReasonText);
// movementHistory shows the path taken with per-step tool usage.
expect(Array.isArray(fileContent.movementHistory)).toBe(true);
expect(fileContent.movementHistory.length).toBe(1);
expect(fileContent.movementHistory[0]).toMatchObject({
name: 'respond',
next: 'ABORT',
toolsUsed: ['Read', 'Glob'],
outputPreview: abortReasonText,
outputTruncated: false,
hasLessons: true,
});
// lessons accumulated from movement results are persisted.
expect(Array.isArray(fileContent.lessons)).toBe(true);
expect(fileContent.lessons.length).toBe(1);
expect(fileContent.lessons[0].movement).toBe('respond');
expect(fileContent.lessons[0].lessons).toContain('validate input presence');
expect(fileContent.stats.totalSteps).toBe(1);
expect(fileContent.stats.movementCount).toBe(1);
expect(fileContent.eventsLogRelative).toBe('logs/events.jsonl');
});
it('v2 snapshot truncates long movement outputs but preserves finalOutput in full', async () => {
const longOutput = 'x'.repeat(2_000);
vi.mocked(executeMovement).mockResolvedValue({
next: 'ABORT',
output: longOutput,
toolsUsed: [],
});
const piece: PieceDef = {
name: 'tester', description: 'd', max_movements: 3, initial_movement: 'm',
movements: [{
name: 'm', edit: false, persona: 'p', instruction: 'i',
allowed_tools: [], rules: [], default_next: 'COMPLETE',
}],
};
const result = await runPiece(piece, 'task', {} as OpenAICompatClient, workspace);
expect(result.status).toBe('aborted');
const files = findSnapshotFiles();
const fileContent = JSON.parse(readFileSync(join(workspace, 'logs', files[0]!), 'utf-8'));
expect(fileContent.movementHistory[0].outputPreview.length).toBe(500);
expect(fileContent.movementHistory[0].outputTruncated).toBe(true);
// finalOutput keeps the full text uncapped so the LLM's reasoning isn't lost.
expect(fileContent.finalOutput).toBe(longOutput);
});
// A run that finishes with COMPLETE must leave no trace of a memory snapshot: no path on the
// result, no snapshot file, and no memory_snapshot_written event in the event log.
it('does NOT write snapshot on successful completion', async () => {
  const completingPiece: PieceDef = {
    name: 'tester',
    description: 'd',
    max_movements: 1,
    initial_movement: 'm',
    movements: [
      {
        name: 'm',
        edit: false,
        persona: 'p',
        instruction: 'i',
        allowed_tools: [],
        rules: [],
        default_next: 'COMPLETE',
      },
    ],
  };
  vi.mocked(executeMovement).mockResolvedValue({ next: 'COMPLETE', output: 'done', toolsUsed: [] });

  const outcome = await runPiece(completingPiece, 'task', {} as OpenAICompatClient, workspace);

  expect(outcome.status).toBe('completed');
  expect(outcome.memorySnapshotPath).toBeUndefined();
  expect(findSnapshotFiles()).toEqual([]);

  const sawSnapshotEvent = readAllEvents(workspace).some((event) => event.kind === 'memory_snapshot_written');
  expect(sawSnapshotEvent).toBe(false);
});
// When the movement hands off to WAIT_SUBTASKS the run reports waiting_subtasks, and neither a
// snapshot path nor a snapshot file is expected.
it('does NOT write snapshot on waiting_subtasks (transient pause)', async () => {
  const pausingPiece: PieceDef = {
    name: 'tester',
    description: 'd',
    max_movements: 1,
    initial_movement: 'm',
    movements: [
      {
        name: 'm',
        edit: false,
        persona: 'p',
        instruction: 'i',
        allowed_tools: [],
        // The piece declares WAIT_SUBTASKS as a rule target for this movement.
        rules: [{ condition: 'spawned', next: 'WAIT_SUBTASKS' }],
        default_next: 'COMPLETE',
      },
    ],
  };
  vi.mocked(executeMovement).mockResolvedValue({
    next: 'WAIT_SUBTASKS',
    output: 'spawned children',
    toolsUsed: [],
  });

  const outcome = await runPiece(pausingPiece, 'task', {} as OpenAICompatClient, workspace);

  expect(outcome.status).toBe('waiting_subtasks');
  expect(outcome.memorySnapshotPath).toBeUndefined();
  expect(findSnapshotFiles()).toEqual([]);
});
});
// normalizeRequiredMcp mutates the piece in place; each case inspects piece.required_mcp afterwards.
describe('piece required_mcp parsing', () => {
  const PIECE_NAME = 'mcp-test';

  /** The single movement shared by every fixture in this suite. */
  const soleMovement = (): PieceDef['movements'] => [
    { name: 'm', edit: false, persona: 'p', instruction: 'i', allowed_tools: [], rules: [] },
  ];

  /**
   * Builds a fixture whose required_mcp is exactly what the caller passes. The parameter is
   * `unknown` so values that do not match the declared string[] type can be fed to the normalizer.
   */
  const makePieceWithMcp = (required_mcp: unknown): PieceDef => ({
    name: PIECE_NAME,
    description: 'test',
    max_movements: 1,
    initial_movement: 'm',
    required_mcp: required_mcp as string[],
    movements: soleMovement(),
  });

  it('retains valid required_mcp slugs', () => {
    const fixture = makePieceWithMcp(['canva', 'notion']);

    normalizeRequiredMcp(fixture, PIECE_NAME);

    expect(fixture.required_mcp).toEqual(['canva', 'notion']);
  });

  it('drops invalid slugs and keeps only valid ones', () => {
    const fixture = makePieceWithMcp(['canva', 'BAD!!', 123]);

    normalizeRequiredMcp(fixture, PIECE_NAME);

    expect(fixture.required_mcp).toEqual(['canva']);
  });

  it('leaves required_mcp undefined when field is absent', () => {
    // Built by hand so the required_mcp key is not present on the object at all.
    const fixture: PieceDef = {
      name: PIECE_NAME,
      description: 'test',
      max_movements: 1,
      initial_movement: 'm',
      movements: soleMovement(),
    };

    normalizeRequiredMcp(fixture, PIECE_NAME);

    expect(fixture.required_mcp).toBeUndefined();
  });

  it('normalizes required_mcp to empty array when field is not an array', () => {
    const fixture = makePieceWithMcp('not-an-array');

    normalizeRequiredMcp(fixture, PIECE_NAME);

    expect(fixture.required_mcp).toEqual([]);
  });
});
// Phase 4: per-movement SSH connection allowlist validation.
// Exercises validateAllowedSshConnections (returns a list of error strings) and validatePieceDef
// (throws) against movements that combine SSH tools with various allowed_ssh_connections values.
describe('allowed_ssh_connections validation (Phase 4)', () => {
  type Movement = PieceDef['movements'][number];

  const SAMPLE_CONNECTION_ID = '6f9619ff-8b86-d011-b42d-00c04fc964ff';

  /** Non-editing movement named "m1" with no tools and no rules; `overrides` replaces any of those defaults. */
  const makeMovement = (overrides: Partial<Movement> = {}): Movement => ({
    name: 'm1',
    edit: false,
    persona: 'p',
    instruction: 'i',
    allowed_tools: [],
    rules: [],
    ...overrides,
  });

  /** Wraps movements in a piece named "ssh-test" that starts at the first movement ("m1" when the list is empty). */
  const makeSshPiece = (movements: PieceDef['movements']): PieceDef => ({
    name: 'ssh-test',
    description: 'test',
    max_movements: 1,
    initial_movement: movements[0]?.name ?? 'm1',
    movements,
  });

  /** Validation errors for a piece made of one movement built from `overrides`. */
  const errorsForMovement = (overrides: Partial<Movement>): string[] =>
    validateAllowedSshConnections(makeSshPiece([makeMovement(overrides)]));

  it('passes when no SSH tools and no allowlist', () => {
    const plainPiece = makeSshPiece([makeMovement({ allowed_tools: ['Read'] })]);

    expect(validateAllowedSshConnections(plainPiece)).toEqual([]);
    expect(() => validatePieceDef(plainPiece)).not.toThrow();
  });

  it('passes when SSH tool present and allowlist declared (UUID)', () => {
    const found = errorsForMovement({
      allowed_tools: ['SshExec', 'Read'],
      allowed_ssh_connections: [SAMPLE_CONNECTION_ID],
    });

    expect(found).toEqual([]);
  });

  it('passes when SSH tool present and allowlist declared (empty array = explicit deny)', () => {
    const found = errorsForMovement({ allowed_tools: ['SshExec'], allowed_ssh_connections: [] });

    expect(found).toEqual([]);
  });

  it('passes when allowlist is wildcard ["*"]', () => {
    const found = errorsForMovement({ allowed_tools: ['SshUpload'], allowed_ssh_connections: ['*'] });

    expect(found).toEqual([]);
  });

  it('rejects when SSH tool present but allowlist missing', () => {
    const undeclaredPiece = makeSshPiece([makeMovement({ allowed_tools: ['SshExec'] })]);

    const found = validateAllowedSshConnections(undeclaredPiece);

    expect(found).toHaveLength(1);
    expect(found[0]).toContain('uses SSH tool(s) but allowed_ssh_connections is not declared');
    expect(() => validatePieceDef(undeclaredPiece)).toThrow(/allowed_ssh_connections/);
  });

  // Expands to 'rejects SshUpload without allowlist' and 'rejects SshDownload without allowlist'.
  it.each(['SshUpload', 'SshDownload'])('rejects %s without allowlist', (sshTool) => {
    expect(errorsForMovement({ allowed_tools: [sshTool] })).toHaveLength(1);
  });

  it('rejects non-array allowlist', () => {
    const found = errorsForMovement({
      allowed_tools: ['SshExec'],
      allowed_ssh_connections: 'not-an-array' as unknown as string[],
    });

    expect(found[0]).toMatch(/must be an array/);
  });

  it('rejects non-string entries', () => {
    const found = errorsForMovement({
      allowed_tools: ['SshExec'],
      allowed_ssh_connections: [123 as unknown as string],
    });

    expect(found[0]).toMatch(/must be a string/);
  });

  it('rejects entries that are neither wildcard nor valid id format', () => {
    const found = errorsForMovement({
      allowed_tools: ['SshExec'],
      allowed_ssh_connections: ['short'],
    });

    expect(found[0]).toMatch(/must be '\*' or a lowercase hex/);
  });

  it('rejects uppercase / non-hex characters in ids', () => {
    const found = errorsForMovement({
      allowed_tools: ['SshExec'],
      allowed_ssh_connections: ['ZZZZZZZZ-not-hex'],
    });

    expect(found).toHaveLength(1);
  });

  it('allowlist without SSH tool is allowed (no-op, future-proofing)', () => {
    const found = errorsForMovement({
      allowed_tools: ['Read'],
      allowed_ssh_connections: [SAMPLE_CONNECTION_ID],
    });

    expect(found).toEqual([]);
  });

  it('reports offenders across multiple movements', () => {
    // m1 (no allowlist) and m4 (malformed id) should be reported; m2 and m3 are valid.
    const mixedPiece = makeSshPiece([
      makeMovement({ name: 'm1', allowed_tools: ['SshExec'] }),
      makeMovement({ name: 'm2', allowed_tools: ['Read'] }),
      makeMovement({ name: 'm3', allowed_tools: ['SshDownload'], allowed_ssh_connections: ['*'] }),
      makeMovement({ name: 'm4', allowed_tools: ['SshUpload'], allowed_ssh_connections: ['BAD_ID'] }),
    ]);

    const [firstError, secondError, ...extraErrors] = validateAllowedSshConnections(mixedPiece);

    expect(extraErrors).toHaveLength(0);
    expect(firstError).toContain('movement="m1"');
    expect(secondError).toContain('movement="m4"');
  });

  it('validatePieceDef composes error message with piece name', () => {
    const undeclaredPiece = makeSshPiece([makeMovement({ allowed_tools: ['SshExec'] })]);

    expect(() => validatePieceDef(undeclaredPiece)).toThrow(
      /Piece "ssh-test" has invalid allowed_ssh_connections/,
    );
  });
});
// --- Task 1: loadPiece multi-dir support ---
// Covers the third loadPiece argument, which may be a single custom directory or an ordered
// list of custom directories searched before the builtin pieces.
describe('loadPiece multi-dir (string | string[])', () => {
  /**
   * Renders a minimal single-movement piece definition as YAML.
   *
   * Built line by line with explicit two-space nesting so the movement's keys are
   * unambiguously part of the sequence item.
   *
   * @param name - Piece name written to the `name` field.
   * @param description - Text written to the `description` field.
   * @returns YAML text ending with a trailing newline.
   */
  function pieceYaml(name: string, description: string): string {
    return [
      `name: ${name}`,
      `description: ${description}`,
      'max_movements: 1',
      'initial_movement: go',
      'movements:',
      '  - name: go',
      '    edit: false',
      '    persona: w',
      '    instruction: x',
      '    allowed_tools: []',
      '    rules: []',
      '    default_next: COMPLETE',
      '',
    ].join('\n');
  }

  /**
   * Creates one temp directory per prefix (in order), hands them to `run`, and always removes
   * them afterwards, including when `run` throws because an assertion failed.
   *
   * @param prefixes - mkdtemp prefixes, one per directory to create under the OS temp dir.
   * @param run - Test body; receives the created directories positionally.
   */
  function withTempDirs(prefixes: readonly string[], run: (...dirs: string[]) => void): void {
    const created: string[] = [];
    try {
      for (const prefix of prefixes) {
        created.push(mkdtempSync(join(tmpdir(), prefix)));
      }
      run(...created);
    } finally {
      // force: true keeps cleanup from masking the original failure if a dir is already gone.
      for (const dir of created) {
        rmSync(dir, { recursive: true, force: true });
      }
    }
  }

  it('resolves from a list of custom dirs (per-user wins over builtin name miss)', () => {
    // dirA stays empty; only dirB contains the piece.
    withTempDirs(['pa-', 'pb-'], (dirA, dirB) => {
      writeFileSync(join(dirB, 'mycustom.yaml'), pieceYaml('mycustom', 'd'));

      // array form: searches dirA then dirB then builtin
      const p = loadPiece('mycustom', 'pieces', [dirA, dirB]);
      expect(p.name).toBe('mycustom');

      // builtin still resolvable when not in any custom dir
      expect(() => loadPiece('chat', 'pieces', [dirA, dirB])).not.toThrow();
    });
  });

  it('first dir wins when same name appears in two custom dirs', () => {
    withTempDirs(['pa-', 'pb-'], (dirA, dirB) => {
      writeFileSync(join(dirA, 'dup.yaml'), pieceYaml('dup', 'from-a'));
      writeFileSync(join(dirB, 'dup.yaml'), pieceYaml('dup', 'from-b'));

      const p = loadPiece('dup', 'pieces', [dirA, dirB]);
      expect(p.description).toBe('from-a');
    });
  });

  it('string form still works (backward compat)', () => {
    withTempDirs(['pc-'], (dir) => {
      writeFileSync(join(dir, 'strcompat.yaml'), pieceYaml('strcompat', 'str'));

      const p = loadPiece('strcompat', 'pieces', dir);
      expect(p.name).toBe('strcompat');
    });
  });
});