import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import type { LLMEvent, ToolDef } from '../llm/openai-compat.js';
import type { ToolContext } from './tools/index.js';
import { stripThinkingTokens } from './agent-loop.js';
// Unit tests for stripThinkingTokens. Each case passes a raw string and
// asserts, with an exact `toBe` match, the text that must be returned.
describe('stripThinkingTokens', () => {
it('strips ... blocks', () => {
// Only the trailing 'actual output' is expected to survive.
expect(stripThinkingTokens('reasoning hereactual output')).toBe('actual output');
});
it('strips <|thinking|>...|thinking|> blocks', () => {
// Everything from <|thinking|> through <|/thinking|> is expected to be removed.
expect(stripThinkingTokens('<|thinking|>deep thought<|/thinking|>result')).toBe('result');
});
it('strips gemma4 thought pattern', () => {
// A leading `thought` followed by a newline leaves an empty string ...
expect(stripThinkingTokens('thought\n')).toBe('');
// ... and a leading `thought` followed by a space leaves only the remaining text.
expect(stripThinkingTokens('thought hello')).toBe('hello');
});
it('strips ... blocks', () => {
// Only the trailing 'visible output' is expected to survive.
expect(stripThinkingTokens('internal reasoningvisible output')).toBe('visible output');
});
it('returns normal text unchanged', () => {
// Input without any thinking content must round-trip untouched.
expect(stripThinkingTokens('hello world')).toBe('hello world');
});
it('handles multiline thinking blocks', () => {
// Content spread over several lines is removed; only 'output' remains.
expect(stripThinkingTokens('\nline1\nline2\noutput')).toBe('output');
});
});
// The mock functions are created inside vi.hoisted so that they already exist
// when the vi.mock factory below runs (Vitest hoists vi.mock calls above the
// module's imports).
const { executeToolMock, getToolDefsMock } = vi.hoisted(() => ({
executeToolMock: vi.fn(),
getToolDefsMock: vi.fn(),
}));
// Replace './tools/index.js' with a module whose executeTool / getToolDefs
// exports are the mocks above, so tests can script and inspect tool calls.
vi.mock('./tools/index.js', () => ({
executeTool: executeToolMock,
getToolDefs: getToolDefsMock,
}));
import { executeMovement, type Movement } from './agent-loop.js';
import { ToolResultCache } from './context/tool-result-cache.js';
import { WorkspaceMemory } from './context/workspace-memory.js';
/**
 * Builds the `execute` movement fixture shared by the tests in this file.
 *
 * Every field except the tool allowlist is fixed: the movement allows edits,
 * has a single `done -> COMPLETE` rule and uses `COMPLETE` as its default
 * next step.
 *
 * @param allowedTools Tool names placed (by reference) on the movement.
 * @returns A newly created movement object.
 */
function makeMovement(allowedTools: string[]): Movement {
  const movement: Movement = {
    name: 'execute',
    edit: true,
    persona: 'worker',
    instruction: 'Do the work.',
    allowedTools,
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  return movement;
}
/**
 * Produces one minimal function-style tool definition per name.
 *
 * Each definition uses the tool name as its description and declares an
 * empty object schema (no properties, nothing required).
 *
 * @param names Tool names, in the order the definitions should appear.
 * @returns A new array with one definition per entry of `names`.
 */
function makeToolDefs(names: string[]): ToolDef[] {
  const defs: ToolDef[] = [];
  for (const toolName of names) {
    defs.push({
      type: 'function',
      function: {
        name: toolName,
        description: toolName,
        parameters: { type: 'object', properties: {}, required: [] },
      },
    });
  }
  return defs;
}
/**
 * Creates the tool context fixture: a fixed workspace path with editing
 * allowed.
 *
 * @returns A new context object on every call.
 */
function makeContext(): ToolContext {
  const context: ToolContext = {
    workspacePath: '/tmp/agent-loop-test',
    editAllowed: true,
  };
  return context;
}
/**
 * Creates a promise together with the function that resolves it, so a test
 * can decide exactly when an awaited operation finishes.
 *
 * @typeParam T Type of the value the promise resolves with.
 * @returns The pending `promise` and the `resolve` function that settles it.
 */
function deferred<T>(): { promise: Promise<T>; resolve: (value: T) => void } {
  // Definite-assignment assertion: the Promise executor runs synchronously,
  // so `resolve` is always assigned before this function returns.
  let resolve!: (value: T) => void;
  const promise = new Promise<T>((res) => {
    resolve = res;
  });
  return { promise, resolve };
}
/**
 * Polls `predicate` until it returns true, yielding to the event loop
 * between checks.
 *
 * @param predicate Condition to wait for; evaluated once per poll.
 * @param timeoutMs Maximum time to keep polling, in milliseconds (default 250).
 * @returns A promise that resolves once `predicate()` is true.
 * @throws Error('Timed out waiting for condition') when the predicate is still
 *   false after more than `timeoutMs` milliseconds.
 */
async function waitFor(predicate: () => boolean, timeoutMs = 250): Promise<void> {
  const start = Date.now();
  while (!predicate()) {
    if (Date.now() - start > timeoutMs) {
      throw new Error('Timed out waiting for condition');
    }
    // A zero-delay timer (rather than a microtask) lets other queued work run
    // before the next check.
    await new Promise<void>((resolve) => setTimeout(resolve, 0));
  }
}
/**
 * Scripted stand-in for the LLM client.
 *
 * Each `chat()` call replays the next pre-recorded list of events, in the
 * order the lists were given to the constructor, and records the arguments it
 * was called with in `calls`. Once the script is exhausted, further calls
 * yield no events.
 */
class FakeClient {
  /** Position of the next scripted response to replay. */
  private index = 0;

  /** Arguments of every `chat()` call, in call order. */
  readonly calls: Array<{ messages: unknown; tools?: unknown }> = [];

  /**
   * @param responses One event list per expected `chat()` call.
   */
  constructor(private readonly responses: LLMEvent[][]) {}

  /**
   * Records the call and streams the next scripted response.
   *
   * @param messages Conversation passed by the caller; stored as-is.
   * @param tools Tool definitions passed by the caller; stored as-is.
   * @param _signal Accepted for signature compatibility; not used.
   * @returns An async generator over the scripted events for this call.
   */
  async *chat(messages: unknown, tools?: unknown, _signal?: AbortSignal): AsyncGenerator<LLMEvent> {
    this.calls.push({ messages, tools });
    // Fall back to an empty response when the script has run out.
    const response = this.responses[this.index++] ?? [];
    for (const event of response) {
      yield event;
    }
  }
}
describe('executeMovement parallel tool execution', () => {
// Reset both tool mocks after every test so scripted behavior and recorded
// calls never carry over into the next case.
afterEach(() => {
  for (const mock of [executeToolMock, getToolDefsMock]) {
    mock.mockReset();
  }
});
// Verifies that two tool calls from the same LLM response (Read, Glob) are
// both started before either of them has finished.
it('runs consecutive safe tool calls in parallel', async () => {
// Each tool blocks on its own gate, so the test controls when it completes.
const readGate = deferred<{ output: string; isError: boolean }>();
const globGate = deferred<{ output: string; isError: boolean }>();
// Names of tools in the order executeTool was invoked for them.
const started: string[] = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Glob']));
executeToolMock.mockImplementation(async (name: string) => {
started.push(name);
if (name === 'Read') return readGate.promise;
if (name === 'Glob') return globGate.promise;
return { output: 'ok', isError: false };
});
// Response 1 requests both tools; response 2 completes the movement.
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
{ type: 'tool_use', id: 'glob-1', name: 'Glob', input: { pattern: 'input/*' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
// Do not await yet: the run stays pending until the gates are opened below.
const runPromise = executeMovement(makeMovement(['Read', 'Glob']), 'task', client as never, makeContext());
// Both tools must have started while neither gate has been resolved.
await waitFor(() => started.length === 2);
expect(started).toEqual(['Read', 'Glob']);
readGate.resolve({ output: 'read ok', isError: false });
globGate.resolve({ output: 'glob ok', isError: false });
const result = await runPromise;
expect(result.next).toBe('COMPLETE');
expect(executeToolMock).toHaveBeenCalledTimes(2);
});
// Verifies that a Write requested after a Read in the same response is not
// started until the Read has finished.
it('keeps side-effecting tools sequential as a barrier', async () => {
// Each tool blocks on its own gate, so the test controls when it completes.
const readGate = deferred<{ output: string; isError: boolean }>();
const writeGate = deferred<{ output: string; isError: boolean }>();
// Names of tools in the order executeTool was invoked for them.
const started: string[] = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Write']));
executeToolMock.mockImplementation(async (name: string) => {
started.push(name);
if (name === 'Read') return readGate.promise;
if (name === 'Write') return writeGate.promise;
return { output: 'ok', isError: false };
});
// Response 1 requests Read then Write; response 2 completes the movement.
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
{ type: 'tool_use', id: 'write-1', name: 'Write', input: { file_path: 'output/a.txt', content: 'x' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
// Do not await yet: the run stays pending until the gates are opened below.
const runPromise = executeMovement(makeMovement(['Read', 'Write']), 'task', client as never, makeContext());
// While Read is still blocked, only Read may have started.
await waitFor(() => started.length === 1);
expect(started).toEqual(['Read']);
// Releasing Read is what allows Write to start.
readGate.resolve({ output: 'read ok', isError: false });
await waitFor(() => started.length === 2);
expect(started).toEqual(['Read', 'Write']);
writeGate.resolve({ output: 'write ok', isError: false });
const result = await runPromise;
expect(result.next).toBe('COMPLETE');
expect(executeToolMock).toHaveBeenCalledTimes(2);
});
// Verifies that when `complete` appears between two regular tool calls in one
// response, both regular tools still run and the movement then completes.
it('executes regular tools before transition even if transition appears mid-batch', async () => {
// Each tool blocks on its own gate, so the test controls when it completes.
const readGate = deferred<{ output: string; isError: boolean }>();
const globGate = deferred<{ output: string; isError: boolean }>();
// Names of tools in the order executeTool was invoked for them.
const started: string[] = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Glob']));
executeToolMock.mockImplementation(async (name: string) => {
started.push(name);
if (name === 'Read') return readGate.promise;
if (name === 'Glob') return globGate.promise;
return { output: 'glob ok', isError: false };
});
// A single response: Read, then the `complete` transition, then Glob.
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'tool_use', id: 'glob-1', name: 'Glob', input: { pattern: 'input/*' } },
{ type: 'done' },
],
]);
// Do not await yet: the run stays pending until the gates are opened below.
const runPromise = executeMovement(makeMovement(['Read', 'Glob']), 'task', client as never, makeContext());
// Both regular tools must have started before either gate is resolved.
await waitFor(() => started.length === 2);
expect(started).toEqual(['Read', 'Glob']);
readGate.resolve({ output: 'read ok', isError: false });
globGate.resolve({ output: 'glob ok', isError: false });
const result = await runPromise;
expect(result.next).toBe('COMPLETE');
// Exactly two executeTool calls are expected, i.e. `complete` is not
// expected to go through executeTool.
expect(executeToolMock).toHaveBeenCalledTimes(2);
});
// Verifies the streaming callbacks: each text chunk is reported individually,
// the preview carries the concatenated text under the movement name, and the
// tool call is reported with its name and input.
it('emits text, preview, and tool callbacks in order', async () => {
  // Sinks recording what each callback receives.
  const textChunks: string[] = [];
  const previews: Array<{ movement: string; preview: string }> = [];
  // `Record` requires explicit type arguments; tool inputs are treated as
  // arbitrary string-keyed objects here.
  const toolUses: Array<{ name: string; input: Record<string, unknown> }> = [];
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });
  // Response 1 streams two text chunks and a Read call; response 2 completes.
  const client = new FakeClient([
    [
      { type: 'text', text: 'first ' },
      { type: 'text', text: 'second' },
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ],
  ]);
  const result = await executeMovement(
    makeMovement(['Read']),
    'task',
    client as never,
    makeContext(),
    {
      callbacks: {
        onText: (text) => textChunks.push(text),
        onTextPreview: (movementName, preview) => previews.push({ movement: movementName, preview }),
        onToolUse: (toolName, input) => toolUses.push({ name: toolName, input }),
      },
    },
  );
  expect(result.next).toBe('COMPLETE');
  expect(textChunks).toEqual(['first ', 'second']);
  expect(previews).toEqual([{ movement: 'execute', preview: 'first second' }]);
  expect(toolUses).toContainEqual({ name: 'Read', input: { file_path: 'input/a.txt' } });
});
// Verifies that a finished tool call is reported through onToolResult with
// the tool's name and error flag.
it('fires onToolResult callback after tool execution', async () => {
  // Collects the (name, isError) pair of every reported tool result.
  const reported: Array<{ name: string; isError: boolean }> = [];

  // Turn 1: the model reads a file. Turn 2: it completes the movement.
  const readTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
    { type: 'done' },
  ];
  const completeTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
    { type: 'done' },
  ];
  const llm = new FakeClient([readTurn, completeTurn]);

  executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), {
    callbacks: {
      onToolResult: (name, info) => reported.push({ name, isError: info.isError }),
    },
  });

  expect(reported).toContainEqual({ name: 'Read', isError: false });
});
// Verifies that one regular tool call produces exactly one memory checkpoint
// notification, carrying the value 1.
it('fires onMemoryCheckpoint callback after regular tool use', async () => {
  // Values passed to onMemoryCheckpoint, in call order.
  const checkpointCounts: number[] = [];

  // Turn 1: the model reads a file. Turn 2: it completes the movement.
  const readTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
    { type: 'done' },
  ];
  const completeTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
    { type: 'done' },
  ];
  const llm = new FakeClient([readTurn, completeTurn]);

  executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), {
    callbacks: {
      onMemoryCheckpoint: (count) => checkpointCounts.push(count),
    },
  });

  expect(checkpointCounts).toEqual([1]);
});
// Verifies that a `force_transition` threshold ends the movement with its
// default next step and a "Context limit reached" message in the output.
it('triggers force_transition when context manager signals exhaustion', async () => {
  const contextModule = await import('./context-manager.js');
  // A 0.005 ratio of 10_000 tokens is crossed by the 80 prompt tokens reported below.
  const manager = new contextModule.ContextManager({
    thresholds: [{ ratio: 0.005, action: 'force_transition' }],
    limitTokens: 10_000,
  });

  // Only one scripted turn: a Read call whose usage crosses the threshold.
  const readTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
    { type: 'done', usage: { prompt_tokens: 80, completion_tokens: 10 } },
  ];
  const llm = new FakeClient([readTurn]);

  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), {
    contextManager: manager,
  });

  expect(outcome.next).toBe('COMPLETE');
  expect(outcome.output).toContain('Context limit reached');
});
// Verifies that crossing a `warn` threshold is reported through
// onContextAction.
it('fires onContextAction callback when context threshold crossed', async () => {
  const contextModule = await import('./context-manager.js');
  const manager = new contextModule.ContextManager({
    thresholds: [{ ratio: 0.005, action: 'warn' }],
    limitTokens: 10_000,
  });
  // Types of every context action reported to the callback.
  const seenActions: Array<{ type: string }> = [];

  // Turn 1 reports 60 prompt tokens alongside a Read call; turn 2 completes.
  const readTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
    { type: 'done', usage: { prompt_tokens: 60, completion_tokens: 10 } },
  ];
  const completeTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
    { type: 'done' },
  ];
  const llm = new FakeClient([readTurn, completeTurn]);

  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), {
    callbacks: {
      onContextAction: (action) => seenActions.push({ type: action.type }),
    },
    contextManager: manager,
  });

  expect(seenActions).toContainEqual({ type: 'warn' });
});
// Verifies that onContextUpdate reports token usage: an update with 0 prompt
// tokens and one with the 1234 prompt tokens from the first LLM response.
it('fires onContextUpdate callback after each LLM usage update', async () => {
  const contextModule = await import('./context-manager.js');
  const manager = new contextModule.ContextManager({ limitTokens: 100_000 });
  // Every payload handed to onContextUpdate, in call order.
  const payloads: Array<{ promptTokens: number; limitTokens: number }> = [];

  // Turn 1 reports 1234 prompt tokens alongside a Read call; turn 2 completes.
  const readTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
    { type: 'done', usage: { prompt_tokens: 1234, completion_tokens: 10 } },
  ];
  const completeTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
    { type: 'done' },
  ];
  const llm = new FakeClient([readTurn, completeTurn]);

  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), {
    callbacks: {
      onContextUpdate: (payload) => payloads.push(payload),
    },
    contextManager: manager,
  });

  expect(payloads.length).toBeGreaterThanOrEqual(1);
  expect(payloads).toContainEqual({ promptTokens: 0, limitTokens: 100_000 });
  expect(payloads).toContainEqual({ promptTokens: 1234, limitTokens: 100_000 });
});
// Verifies that a tool result too large for the context budget is replaced by
// an omission placeholder before the next LLM request, and that the model is
// told to retry with a narrower read.
it('compacts oversized tool results before the next LLM request', async () => {
  const { ContextManager } = await import('./context-manager.js');
  const cm = new ContextManager({ limitTokens: 20_000 });
  // Fixture: a payload intended to be far larger than the 20_000-token limit.
  // A near-empty literal here would never exercise compaction and would make
  // the `not.toContain(hugeBase64)` check below meaningless.
  const hugeBase64 = `data:image/png;base64,${'A'.repeat(200_000)}`;
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: hugeBase64, isError: false });
  // Response 1 reads the page; response 2 completes the movement.
  const client = new FakeClient([
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/page.html' } },
      { type: 'done', usage: { prompt_tokens: 1000, completion_tokens: 10 } },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done', usage: { prompt_tokens: 2000, completion_tokens: 10 } },
    ],
  ]);
  const result = await executeMovement(
    makeMovement(['Read']),
    'task',
    client as never,
    makeContext(),
    { contextManager: cm },
  );
  expect(result.next).toBe('COMPLETE');
  expect(client.calls).toHaveLength(2);
  // Inspect what the second LLM request actually contained.
  const secondMessages = client.calls[1]!.messages as Array<{ role: string; content?: string }>;
  const toolMessage = secondMessages.find((message) => message.role === 'tool');
  expect(toolMessage?.content).toContain('[Tool result omitted before LLM request]');
  expect(toolMessage?.content).not.toContain(hugeBase64);
  // The agent should also receive a user-facing instruction to retry with a
  // narrower scope, instead of silently dropping content.
  const feedbackMessage = secondMessages.find(
    (message) => message.role === 'user' && message.content?.includes('LLM コンテキストに入れられませんでした'),
  );
  expect(feedbackMessage).toBeDefined();
  expect(feedbackMessage?.content).toMatch(/Read\(offset\/limit\)|Grep/);
});
// Verifies that when the same file is read twice, the older result is swapped
// for a "Duplicate Read" placeholder while the newest keeps its full content.
it('dedupes duplicate file reads before compacting tool results', async () => {
  const contextModule = await import('./context-manager.js');
  const manager = new contextModule.ContextManager({ limitTokens: 40_000 });

  // 80% guard => 32_000 tokens. Each large ASCII read is roughly 20k tokens.
  // Two duplicate reads exceed the guard without dedup; after dedup, only
  // the latest read survives and the request fits.
  const largeContent = 'X'.repeat(72_000);
  executeToolMock.mockResolvedValue({ output: largeContent, isError: false });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  // Two reads of the same path on consecutive turns, then completion.
  const firstRead: LLMEvent[] = [
    { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: '/dup.ts' } },
    { type: 'done', usage: { prompt_tokens: 500, completion_tokens: 10 } },
  ];
  const secondRead: LLMEvent[] = [
    { type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: '/dup.ts' } },
    { type: 'done', usage: { prompt_tokens: 18_000, completion_tokens: 10 } },
  ];
  const completeTurn: LLMEvent[] = [
    { type: 'tool_use', id: 't', name: 'complete', input: { status: 'success', result: 'done' } },
    { type: 'done' },
  ];
  const llm = new FakeClient([firstRead, secondRead, completeTurn]);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), {
    contextManager: manager,
  });

  expect(outcome.next).toBe('COMPLETE');
  expect(llm.calls.length).toBe(3);

  // The 3rd LLM call should see the older Read replaced with a placeholder.
  const thirdCallMessages = llm.calls[2]!.messages as Array<{ role: string; content?: string }>;
  const isToolText = (m: { role: string; content?: string }): boolean =>
    m.role === 'tool' && typeof m.content === 'string';
  const placeholders = thirdCallMessages.filter(
    (m) => isToolText(m) && m.content!.includes('Duplicate Read of /dup.ts'),
  );
  expect(placeholders.length).toBe(1);

  // The latest Read should still carry the full content.
  const fullReads = thirdCallMessages.filter((m) => isToolText(m) && m.content === largeContent);
  expect(fullReads.length).toBe(1);
});
// Verifies that history summarization kicks in when neither dedup nor
// compaction can shrink the prompt enough.
it('summarizes old turns when dedup+compact alone cannot fit prompt', async () => {
  const contextModule = await import('./context-manager.js');
  const compactorModule = await import('./context/history-compactor.js');
  const summaryPrefix = compactorModule.SUMMARY_MARKER_PREFIX;

  // Tight context, distinct Bash outputs (so dedup cannot help) each below
  // LARGE_TOOL_RESULT_TOKENS (so compaction cannot help either) — only
  // history summarization can keep the conversation going.
  // Phase 6c expanded the system prompt by ~200 chars (memory_update
  // guidance + new tool definition); the limit is bumped a bit so
  // summarization still fires on the 5th Bash turn rather than the 4th,
  // matching the FakeClient slot layout below.
  const manager = new contextModule.ContextManager({ limitTokens: 35_000 });
  executeToolMock.mockResolvedValue({ output: 'B'.repeat(18_000), isError: false });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Bash']));

  // Five Bash turns (ids b1..b5, commands `echo 1`..`echo 5`) with growing
  // reported prompt sizes.
  const promptTokensPerTurn = [1_000, 5_500, 10_000, 15_000, 20_000];
  const bashTurns = promptTokensPerTurn.map((promptTokens, i): LLMEvent[] => [
    { type: 'tool_use', id: `b${i + 1}`, name: 'Bash', input: { command: `echo ${i + 1}` } },
    { type: 'done', usage: { prompt_tokens: promptTokens, completion_tokens: 5 } },
  ]);
  // FakeClient.chat() walks a single response list, so the isolated
  // summarization LLM call (runIsolatedLlm) consumes the next slot: the
  // summary response comes right after the Bash turns, followed by the
  // post-summary transition.
  const summaryTurn: LLMEvent[] = [
    { type: 'text', text: '## ゴール\nrun bash commands\n## 進捗\nDone: 5 commands' },
    { type: 'done' },
  ];
  const completeTurn: LLMEvent[] = [
    { type: 'tool_use', id: 't', name: 'complete', input: { status: 'success', result: 'done' } },
    { type: 'done' },
  ];
  const llm = new FakeClient([...bashTurns, summaryTurn, completeTurn]);

  const outcome = await executeMovement(makeMovement(['Bash']), 'task', llm as never, makeContext(), {
    contextManager: manager,
  });

  expect(outcome.next).toBe('COMPLETE');
  const lastCall = llm.calls[llm.calls.length - 1]!;
  const finalMessages = lastCall.messages as Array<{ role: string; content?: string }>;
  const markerMessages = finalMessages.filter(
    (m) => m.role === 'user' && typeof m.content === 'string' && m.content.startsWith(summaryPrefix),
  );
  // Phase 6c expanded the system prompt; the exact iteration where
  // summarization fires shifted slightly. The invariant we care about
  // is "summarization fired at least once before completion".
  expect(markerMessages.length).toBeGreaterThanOrEqual(1);
});
// Verifies that an initial prompt too large for the context limit aborts with
// `context_overflow` rather than reporting the terminal default as success.
it('aborts when initial prompt is oversized and defaultNext is terminal', async () => {
  const contextModule = await import('./context-manager.js');
  const manager = new contextModule.ContextManager({ limitTokens: 1_000 });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  // The guard will fail before the main LLM is called, but the
  // last-resort summary path uses runIsolatedLlm (also via FakeClient),
  // so one response is provided for that summary call.
  const summaryTurn: LLMEvent[] = [
    { type: 'text', text: '### Status\nstuck on overflow' },
    { type: 'done' },
  ];
  const llm = new FakeClient([summaryTurn]);
  const oversizedTask = 'x'.repeat(10_000);

  const outcome = await executeMovement(makeMovement(['Read']), oversizedTask, llm as never, makeContext(), {
    contextManager: manager,
  });

  // A terminal fallback would report a false completion, so overflow aborts.
  expect(outcome.next).toBe('ABORT');
  expect(outcome.abortCode).toBe('context_overflow');
  expect(outcome.output).toContain('LLM request blocked before send');
  // Only the isolated summary call should have happened — no main LLM call,
  // hence no tool execution either.
  expect(executeToolMock).not.toHaveBeenCalled();
});
// Verifies that an oversized prompt aborts the movement when the movement has
// no defaultNext to force-transition to.
it('falls back to ABORT when oversized prompt has no defaultNext to force-transition to', async () => {
  const contextModule = await import('./context-manager.js');
  const manager = new contextModule.ContextManager({ limitTokens: 1_000 });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  const llm = new FakeClient([
    [{ type: 'text', text: 'should not be called' }, { type: 'done' }],
  ]);

  // Same shape as the shared fixture, but deliberately without defaultNext.
  const movementWithoutDefault: Movement = {
    name: 'execute',
    edit: true,
    persona: 'worker',
    instruction: 'Do the work.',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
  };
  const oversizedTask = 'x'.repeat(10_000);

  const outcome = await executeMovement(movementWithoutDefault, oversizedTask, llm as never, makeContext(), {
    contextManager: manager,
  });

  expect(outcome.next).toBe('ABORT');
  expect(outcome.output).toContain('LLM request blocked before send');
  expect(executeToolMock).not.toHaveBeenCalled();
});
// Verifies that a movement started with an already-aborted signal ends in
// ABORT without executing any tool.
it('returns ABORT immediately when cancelSignal is already aborted', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  // A scripted Read turn exists, but the tool must never be executed.
  const readTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
    { type: 'done' },
  ];
  const llm = new FakeClient([readTurn]);

  // Abort before the movement even starts.
  const aborter = new AbortController();
  aborter.abort();

  const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), {
    cancelSignal: aborter.signal,
  });

  expect(outcome.next).toBe('ABORT');
  expect(outcome.output).toContain('cancelled');
  expect(executeToolMock).not.toHaveBeenCalled();
});
// Verifies that repeated text-only responses (no tool call, no transition)
// end the movement with ABORT.
it('aborts after text-only responses without transition calls', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  // Four text-only turns are scripted. MAX_TEXT_ONLY_RETRIES is 3, so the
  // movement should ABORT after the third and never consume the fourth.
  const replies = ['I will do the work', 'Still working on it', 'Almost done', 'Never reached'];
  const textOnlyTurns = replies.map((text): LLMEvent[] => [{ type: 'text', text }, { type: 'done' }]);
  const llm = new FakeClient(textOnlyTurns);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());

  expect(outcome.next).toBe('ABORT');
  expect(executeToolMock).not.toHaveBeenCalled();
});
// Verifies that text-only responses only count toward the abort limit when
// they are consecutive: a tool call in between starts the count over.
it('resets text-only counter when tool calls happen in between', async () => {
  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  // Small builders for the two kinds of non-final turns used below.
  const textTurn = (text: string): LLMEvent[] => [{ type: 'text', text }, { type: 'done' }];
  const readTurn = (id: string, filePath: string): LLMEvent[] => [
    { type: 'tool_use', id, name: 'Read', input: { file_path: filePath } },
    { type: 'done' },
  ];

  // Alternate: text-only → tool → text-only → tool → text-only → complete.
  // Without the reset, the 3 text-only responses would trigger abort.
  // With the reset, each tool call clears the counter so it never reaches
  // 3 consecutive text-only responses.
  const llm = new FakeClient([
    textTurn('Thinking about the answer...'),
    readTurn('read-1', 'a.txt'),
    textTurn('Found some info, analyzing...'),
    readTurn('read-2', 'b.txt'),
    textTurn('Almost done with analysis'),
    [
      { type: 'tool_use', id: 'complete-1', name: 'complete', input: { status: 'success', result: 'Done' } },
      { type: 'done' },
    ],
  ]);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());

  expect(outcome.next).not.toBe('ABORT');
  expect(executeToolMock).toHaveBeenCalled();
});
// Verifies that a movement which keeps calling tools without ever
// transitioning is aborted once `maxIterations` is exceeded.
it('aborts when maxIterations is exceeded', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
  // LLM keeps using tools but never calls transition. Ids are derived from
  // the turn index so they are unique and identical on every run (no
  // Math.random in test fixtures).
  const toolResponses = Array.from({ length: 4 }, (_, turn) => [
    { type: 'tool_use' as const, id: `read-${turn + 1}`, name: 'Read', input: { file_path: 'a.txt' } },
    { type: 'done' as const },
  ]);
  const client = new FakeClient(toolResponses);
  const result = await executeMovement(
    makeMovement(['Read']),
    'task',
    client as never,
    makeContext(),
    { maxIterations: 2 },
  );
  expect(result.next).toBe('ABORT');
  expect(result.output).toContain('iteration');
});
});
describe('executeMovement cross-movement Read cache', () => {
// Reset both tool mocks after every test so scripted behavior and recorded
// calls never carry over into the next case.
afterEach(() => {
  for (const mock of [executeToolMock, getToolDefsMock]) {
    mock.mockReset();
  }
});
// Verifies that a Read performed in one movement is served from the shared
// ToolResultCache when a later movement reads the same file.
it('returns a cached Read result when a later movement reads the same file', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  const fileContent = 'const answer = 42;\n';
  // `Record` requires explicit type arguments; tool input is an arbitrary
  // string-keyed object.
  executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
    if (name === 'Read' && input['file_path'] === 'foo.ts') {
      return { output: fileContent, isError: false };
    }
    return { output: 'unexpected', isError: true };
  });
  // One cache instance is shared by both movements below.
  const cache = new ToolResultCache();

  // Movement A: reads foo.ts for real and populates the cache.
  const movementA: Movement = {
    name: 'investigate',
    edit: false,
    persona: 'investigator',
    instruction: 'Look at foo.ts.',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'plan' }],
    defaultNext: 'plan',
  };
  const clientA = new FakeClient([
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'foo.ts' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'transition', input: { next_step: 'plan', summary: 'foo.ts read' } },
      { type: 'done' },
    ],
  ]);
  const resultA = await executeMovement(movementA, 'task', clientA as never, makeContext(), {
    toolResultCache: cache,
  });
  expect(resultA.next).toBe('plan');
  expect(executeToolMock).toHaveBeenCalledTimes(1);
  expect(cache.size()).toBe(1);

  // Movement B: requests the same Read; its result text is captured here.
  const observedToolResults: string[] = [];
  const movementB: Movement = {
    name: 'plan',
    edit: false,
    persona: 'planner',
    instruction: 'Plan based on foo.ts.',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  const clientB = new FakeClient([
    [
      { type: 'tool_use', id: 'read-2', name: 'Read', input: { file_path: 'foo.ts' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-2', name: 'complete', input: { status: 'success', result: 'used cached read' } },
      { type: 'done' },
    ],
  ]);
  const resultB = await executeMovement(movementB, 'task', clientB as never, makeContext(), {
    toolResultCache: cache,
    callbacks: {
      onToolResult: (_name, info) => observedToolResults.push(info.result),
    },
  });
  expect(resultB.next).toBe('COMPLETE');
  // executeTool was NOT invoked again — the cache short-circuited the second Read.
  expect(executeToolMock).toHaveBeenCalledTimes(1);
  // The reported result carries a cached-tag header followed by the original content.
  expect(observedToolResults.some((r) => r.startsWith('[cached: Read foo.ts from movement investigate at '))).toBe(true);
  expect(observedToolResults.some((r) => r.endsWith(fileContent))).toBe(true);
});
// Verifies that a failed Read leaves the cache empty.
it('does not cache error results', async () => {
  executeToolMock.mockResolvedValue({ output: 'ENOENT: no such file', isError: true });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  const sharedCache = new ToolResultCache();

  const investigate: Movement = {
    name: 'investigate',
    edit: false,
    persona: 'investigator',
    instruction: 'Look at missing.ts.',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };

  // Turn 1: a Read that the mock answers with an error. Turn 2: completion.
  const failingRead: LLMEvent[] = [
    { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'missing.ts' } },
    { type: 'done' },
  ];
  const completeTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'tried' } },
    { type: 'done' },
  ];
  const llm = new FakeClient([failingRead, completeTurn]);

  await executeMovement(investigate, 'task', llm as never, makeContext(), { toolResultCache: sharedCache });

  expect(sharedCache.size()).toBe(0);
});
// Verifies that a successful Bash call is not stored in the cache.
it('skips caching tools outside the cacheable allowlist (Bash etc.)', async () => {
  executeToolMock.mockResolvedValue({ output: 'shell ok', isError: false });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Bash']));
  const sharedCache = new ToolResultCache();

  const investigate: Movement = {
    name: 'investigate',
    edit: false,
    persona: 'investigator',
    instruction: 'Run.',
    allowedTools: ['Bash'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };

  // Turn 1: a Bash call that succeeds. Turn 2: completion.
  const bashTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'b-1', name: 'Bash', input: { command: 'echo hi' } },
    { type: 'done' },
  ];
  const completeTurn: LLMEvent[] = [
    { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'ran' } },
    { type: 'done' },
  ];
  const llm = new FakeClient([bashTurn, completeTurn]);

  await executeMovement(investigate, 'task', llm as never, makeContext(), { toolResultCache: sharedCache });

  expect(sharedCache.size()).toBe(0);
});
});
describe('executeMovement Phase 2 cache invalidation', () => {
// Reset both tool mocks after every test so scripted behavior and recorded
// calls never carry over into the next case.
afterEach(() => {
  for (const mock of [executeToolMock, getToolDefsMock]) {
    mock.mockReset();
  }
});
// Verifies that a successful Edit evicts the cached Read of the same path, so
// a later Read hits the tool again and sees the post-edit content.
it('invalidates cached Read after a successful Edit on the same path', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
  // File version observed by each real Read, in order.
  const reads: number[] = [];
  // Simulated file version; every Edit of foo.ts bumps it.
  let version = 1;
  // `Record` requires explicit type arguments; tool input is an arbitrary
  // string-keyed object.
  executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
    if (name === 'Read' && input['file_path'] === 'foo.ts') {
      reads.push(version);
      return { output: `version=${version}`, isError: false };
    }
    if (name === 'Edit' && input['file_path'] === 'foo.ts') {
      version++;
      return { output: 'edited', isError: false };
    }
    return { output: 'unexpected', isError: true };
  });
  // One cache instance is shared by all three movements below.
  const cache = new ToolResultCache();

  // Movement A: reads foo.ts (version 1) and populates the cache.
  const movementA: Movement = {
    name: 'investigate',
    edit: true,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'execute' }],
    defaultNext: 'execute',
  };
  const clientA = new FakeClient([
    [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 'read' } }, { type: 'done' }],
  ]);
  await executeMovement(movementA, 'task', clientA as never, makeContext(), { toolResultCache: cache });
  expect(cache.size()).toBe(1);

  // Movement B: edits foo.ts, which must evict the cached Read.
  const movementB: Movement = {
    name: 'execute',
    edit: true,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Edit'],
    rules: [{ condition: 'done', next: 'verify' }],
    defaultNext: 'verify',
  };
  const clientB = new FakeClient([
    [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't2', name: 'transition', input: { next_step: 'verify', summary: 'edited' } }, { type: 'done' }],
  ]);
  await executeMovement(movementB, 'task', clientB as never, makeContext(), { toolResultCache: cache });
  expect(cache.size()).toBe(0); // Edit evicted the Read entry

  // Movement C: reads foo.ts again; the result text is captured here.
  const observed: string[] = [];
  const movementC: Movement = {
    name: 'verify',
    edit: false,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  const clientC = new FakeClient([
    [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't3', name: 'complete', input: { status: 'success', result: 're-read' } }, { type: 'done' }],
  ]);
  await executeMovement(movementC, 'task', clientC as never, makeContext(), {
    toolResultCache: cache,
    callbacks: { onToolResult: (_n, info) => observed.push(info.result) },
  });
  // Re-read happened (no cache hit) and saw the post-edit version.
  expect(reads).toEqual([1, 2]);
  expect(observed.some((r) => r === 'version=2')).toBe(true);
  expect(observed.some((r) => r.startsWith('[cached:'))).toBe(false);
});
// Path-scoped invalidation: after a successful Edit of foo.ts, a second Read
// of foo.ts must reach the tool again, while a second Read of bar.ts must be
// answered with a "[cached: Read bar.ts" result instead of a fresh tool call.
it('invalidates only the affected path; other cached files survive', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
  // Records every call that actually reaches the mocked tool executor.
  const callLog: Array<{ name: string; path?: string }> = [];
  // `Record` needs explicit type arguments; the tool input is an opaque bag
  // here, so values are `unknown` and narrowed at the point of use.
  executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
    callLog.push({ name, path: input['file_path'] as string | undefined });
    if (name === 'Read') return { output: `${input['file_path']} body`, isError: false };
    if (name === 'Edit') return { output: 'edited', isError: false };
    return { output: 'x', isError: true };
  });
  const cache = new ToolResultCache();

  // Movement A: read two different files so the cache holds two entries.
  const movA: Movement = {
    name: 'investigate', edit: true, persona: 'p', instruction: 'i',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'execute' }],
    defaultNext: 'execute',
  };
  const clientA = new FakeClient([
    [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'bar.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(movA, 'task', clientA as never, makeContext(), { toolResultCache: cache });
  expect(cache.size()).toBe(2);

  // Movement B: edit foo.ts, then ask for both files again in a single turn.
  const movB: Movement = {
    name: 'execute', edit: true, persona: 'p', instruction: 'i',
    allowedTools: ['Edit', 'Read'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  const observed: string[] = [];
  const clientB = new FakeClient([
    [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
    [
      { type: 'tool_use', id: 'r3', name: 'Read', input: { file_path: 'foo.ts' } },
      { type: 'tool_use', id: 'r4', name: 'Read', input: { file_path: 'bar.ts' } },
      { type: 'done' },
    ],
    [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(movB, 'task', clientB as never, makeContext(), {
    toolResultCache: cache,
    callbacks: { onToolResult: (_n, info) => observed.push(info.result) },
  });

  const fooReadCount = callLog.filter((c) => c.name === 'Read' && c.path === 'foo.ts').length;
  const barReadCount = callLog.filter((c) => c.name === 'Read' && c.path === 'bar.ts').length;
  expect(fooReadCount).toBe(2); // re-read after Edit
  expect(barReadCount).toBe(1); // bar still cached
  expect(observed.some((r) => r.startsWith('[cached: Read bar.ts'))).toBe(true);
  expect(observed.some((r) => r.startsWith('[cached: Read foo.ts'))).toBe(false);
});
// Bash is handled conservatively: once it succeeds, both previously cached
// Read entries are expected to be gone from the shared cache.
it('Bash invalidates every file-derived entry conservatively', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Bash']));
  let readCount = 0;
  executeToolMock.mockImplementation(async (toolName: string) => {
    switch (toolName) {
      case 'Read':
        readCount += 1;
        return { output: `read ${readCount}`, isError: false };
      case 'Bash':
        return { output: 'shell ok', isError: false };
      default:
        return { output: 'x', isError: true };
    }
  });
  const sharedCache = new ToolResultCache();

  // First movement: two distinct reads populate the cache.
  const investigate: Movement = {
    name: 'investigate',
    edit: false,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'execute' }],
    defaultNext: 'execute',
  };
  const investigateClient = new FakeClient([
    [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'bar.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(investigate, 'task', investigateClient as never, makeContext(), {
    toolResultCache: sharedCache,
  });
  expect(sharedCache.size()).toBe(2);

  // Second movement: a single successful shell command, then completion.
  const execute: Movement = {
    name: 'execute',
    edit: false,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Bash'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  const executeClient = new FakeClient([
    [{ type: 'tool_use', id: 'b1', name: 'Bash', input: { command: 'echo hi' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(execute, 'task', executeClient as never, makeContext(), {
    toolResultCache: sharedCache,
  });
  expect(sharedCache.size()).toBe(0); // every Read evicted
});
// A mutating tool that reports isError must leave the cache untouched: the
// Read entry stored by the first movement is still present after the failed Edit.
it('does NOT invalidate when Edit/Write/Bash returned an error', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
  let failedEdits = 0;
  executeToolMock.mockImplementation(async (toolName: string) => {
    switch (toolName) {
      case 'Read':
        return { output: 'body', isError: false };
      case 'Edit':
        failedEdits += 1;
        return { output: 'permission denied', isError: true };
      default:
        return { output: 'x', isError: true };
    }
  });
  const sharedCache = new ToolResultCache();

  // First movement: one read, so the cache holds exactly one entry.
  const investigate: Movement = {
    name: 'investigate',
    edit: true,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'execute' }],
    defaultNext: 'execute',
  };
  const investigateClient = new FakeClient([
    [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(investigate, 'task', investigateClient as never, makeContext(), {
    toolResultCache: sharedCache,
  });
  expect(sharedCache.size()).toBe(1);

  // Second movement: the Edit on the same path fails, then the run completes.
  const execute: Movement = {
    name: 'execute',
    edit: true,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Edit'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  const executeClient = new FakeClient([
    [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(execute, 'task', executeClient as never, makeContext(), {
    toolResultCache: sharedCache,
  });
  expect(failedEdits).toBe(1);
  expect(sharedCache.size()).toBe(1); // failed Edit kept the cache intact
});
});
// Phase 3: a WorkspaceMemory instance shared across executeMovement calls.
// Covers persistence of memory_update payloads, their appearance in the next
// system prompt, invalidation by Edit, and the no-memory_update path.
describe('executeMovement Phase 3 WorkspaceMemory', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('persists memory_update from one movement and exposes it in the next system prompt', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });
    const sharedMemory = new WorkspaceMemory();

    // Movement 1 reads a file and transitions with one fact, one decision and
    // one do_not_repeat entry attached.
    const investigate: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'investigator',
      instruction: 'Find the bug.',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const investigateClient = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'plan',
            summary: 'investigated',
            memory_update: {
              facts: [{ claim: 'foo.ts uses bar()', evidence_paths: ['foo.ts'], confidence: 'high' }],
              decisions: [{ text: 'patch foo.ts directly', evidence_paths: ['foo.ts'] }],
              do_not_repeat: ['re-read foo.ts unless evidence breaks'],
            },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(investigate, 'task', investigateClient as never, makeContext(), {
      workspaceMemory: sharedMemory,
    });
    expect(sharedMemory.size().facts).toBe(1);
    expect(sharedMemory.size().decisions).toBe(1);
    expect(sharedMemory.size().doNotRepeat).toBe(1);

    // Movement 2 completes immediately; only its first LLM request matters.
    const plan: Movement = {
      name: 'plan',
      edit: false,
      persona: 'planner',
      instruction: 'Plan the fix.',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const planClient = new FakeClient([
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 'planned' } }, { type: 'done' }],
    ]);
    await executeMovement(plan, 'task', planClient as never, makeContext(), { workspaceMemory: sharedMemory });

    const firstRequest = planClient.calls[0]?.messages as Array<{ role: string; content: string }>;
    const systemPrompt = firstRequest[0]!;
    expect(systemPrompt.role).toBe('system');
    expect(systemPrompt.content).toContain('## これまでに蓄積した観測');
    expect(systemPrompt.content).toContain('foo.ts uses bar()');
    expect(systemPrompt.content).toContain('patch foo.ts directly');
    expect(systemPrompt.content).toContain('re-read foo.ts unless evidence breaks');
    expect(systemPrompt.content).toContain('memory は再調査禁止の根拠ではなく');
  });

  it('omits memory section when memory is empty', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const emptyMemory = new WorkspaceMemory();
    const investigate: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const llm = new FakeClient([
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(investigate, 'task', llm as never, makeContext(), { workspaceMemory: emptyMemory });

    const firstRequest = llm.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(firstRequest[0]!.content).not.toContain('## これまでに蓄積した観測');
  });

  it('Edit invalidates a fact whose evidence_paths includes the edited file', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    executeToolMock.mockImplementation(async (toolName: string) => {
      switch (toolName) {
        case 'Read':
          return { output: 'body', isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });
    const sharedMemory = new WorkspaceMemory();

    // Movement 1 records two facts backed by different files.
    const investigate: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const investigateClient = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'execute',
            summary: 'investigated',
            memory_update: {
              facts: [
                { claim: 'foo.ts uses bar()', evidence_paths: ['foo.ts'] },
                { claim: 'unrelated truth', evidence_paths: ['bar.ts'] },
              ],
            },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(investigate, 'task', investigateClient as never, makeContext(), {
      workspaceMemory: sharedMemory,
    });
    expect(sharedMemory.snapshot().facts).toHaveLength(2);

    // Movement 2 edits foo.ts; only the bar.ts-backed fact should remain.
    const execute: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const executeClient = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 'edited' } }, { type: 'done' }],
    ]);
    await executeMovement(execute, 'task', executeClient as never, makeContext(), { workspaceMemory: sharedMemory });

    const remaining = sharedMemory.snapshot();
    expect(remaining.facts).toHaveLength(1);
    expect(remaining.facts[0]!.claim).toBe('unrelated truth');
  });

  it('still works when piece submits no memory_update (backward compat)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });
    const untouchedMemory = new WorkspaceMemory();
    const investigate: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const llm = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 'just summary' } }, { type: 'done' }],
    ]);
    const outcome = await executeMovement(investigate, 'task', llm as never, makeContext(), {
      workspaceMemory: untouchedMemory,
    });
    expect(outcome.next).toBe('COMPLETE');
    expect(untouchedMemory.size().facts).toBe(0);
  });
});
// Phase 4: ToolResultCache coverage beyond Read — Grep, WebFetch and ReadPdf —
// including how a successful Edit affects each kind of entry.
describe('executeMovement Phase 4 cache extension', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  // Every movement in this suite uses the same placeholder persona and
  // instruction and a single 'done' rule pointing at `next`.
  const step = (name: string, edit: boolean, allowedTools: string[], next: string): Movement => ({
    name,
    edit,
    persona: 'p',
    instruction: 'i',
    allowedTools,
    rules: [{ condition: 'done', next }],
    defaultNext: next,
  });

  it('caches Grep results and serves them to a later movement', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Grep']));
    let grepRuns = 0;
    executeToolMock.mockImplementation(async (toolName: string) => {
      if (toolName !== 'Grep') return { output: 'x', isError: true };
      grepRuns += 1;
      return { output: 'matches in src/foo.ts:1\n', isError: false };
    });
    const sharedCache = new ToolResultCache();

    const firstClient = new FakeClient([
      [{ type: 'tool_use', id: 'g1', name: 'Grep', input: { pattern: 'foo', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(step('investigate', false, ['Grep'], 'plan'), 'task', firstClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(grepRuns).toBe(1);

    // Same pattern and path in a later movement: no second tool execution.
    const seenResults: string[] = [];
    const secondMovement = step('plan', false, ['Grep'], 'COMPLETE');
    const secondClient = new FakeClient([
      [{ type: 'tool_use', id: 'g2', name: 'Grep', input: { pattern: 'foo', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(secondMovement, 'task', secondClient as never, makeContext(), {
      toolResultCache: sharedCache,
      callbacks: { onToolResult: (_n, info) => seenResults.push(info.result) },
    });
    expect(grepRuns).toBe(1);
    expect(seenResults.some((r) => r.startsWith('[cached: Grep foo'))).toBe(true);
  });

  it('Edit invalidates ALL search entries (Grep) regardless of scope', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Grep', 'Edit']));
    let grepRuns = 0;
    executeToolMock.mockImplementation(async (toolName: string) => {
      switch (toolName) {
        case 'Grep':
          grepRuns += 1;
          return { output: `match v${grepRuns}`, isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });
    const sharedCache = new ToolResultCache();

    const firstClient = new FakeClient([
      [{ type: 'tool_use', id: 'g1', name: 'Grep', input: { pattern: 'x', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(step('investigate', true, ['Grep'], 'execute'), 'task', firstClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    // The edited file is outside the grep's path, yet the identical Grep is
    // expected to execute again.
    const secondMovement = step('execute', true, ['Edit', 'Grep'], 'COMPLETE');
    const secondClient = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'unrelated.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'g2', name: 'Grep', input: { pattern: 'x', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(secondMovement, 'task', secondClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(grepRuns).toBe(2);
  });

  it('caches WebFetch and survives Edit invalidation', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['WebFetch', 'Edit']));
    let fetchRuns = 0;
    executeToolMock.mockImplementation(async (toolName: string) => {
      switch (toolName) {
        case 'WebFetch':
          fetchRuns += 1;
          return { output: 'page body', isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });
    const sharedCache = new ToolResultCache();

    const firstClient = new FakeClient([
      [{ type: 'tool_use', id: 'w1', name: 'WebFetch', input: { url: 'https://example.com/x' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(step('investigate', true, ['WebFetch'], 'execute'), 'task', firstClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    // A local Edit followed by the same URL: still exactly one real fetch.
    const secondMovement = step('execute', true, ['Edit', 'WebFetch'], 'COMPLETE');
    const secondClient = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'w2', name: 'WebFetch', input: { url: 'https://example.com/x' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(secondMovement, 'task', secondClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(fetchRuns).toBe(1);
  });

  it('caches Office tool results (ReadPdf) keyed by path + range', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['ReadPdf']));
    let pdfRuns = 0;
    executeToolMock.mockImplementation(async (toolName: string) => {
      if (toolName !== 'ReadPdf') return { output: 'x', isError: true };
      pdfRuns += 1;
      return { output: 'pdf body', isError: false };
    });
    const sharedCache = new ToolResultCache();
    // The same movement object is executed twice with different scripts.
    const pdfMovement = step('investigate', false, ['ReadPdf'], 'COMPLETE');

    const firstClient = new FakeClient([
      [{ type: 'tool_use', id: 'p1', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '1-3' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(pdfMovement, 'task', firstClient as never, makeContext(), { toolResultCache: sharedCache });
    expect(pdfRuns).toBe(1);

    // Pages 1-3 repeat the earlier request; pages 4-6 are a new range, so
    // exactly one additional execution is expected.
    const secondClient = new FakeClient([
      [{ type: 'tool_use', id: 'p2', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '1-3' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'p3', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '4-6' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(pdfMovement, 'task', secondClient as never, makeContext(), { toolResultCache: sharedCache });
    expect(pdfRuns).toBe(2);
  });
});
// Checks the checklist-guidance wording of the system prompt by inspecting the
// first message of the first request that executeMovement sends to the client.
describe('buildSystemPrompt checklist guidance', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('emits a "進捗管理" section with strengthened CreateChecklist guidance', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const investigate: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const llm = new FakeClient([
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(investigate, 'task', llm as never, makeContext());

    const firstRequest = llm.calls[0]?.messages as Array<{ role: string; content: string }>;
    const systemPrompt = firstRequest[0]!;
    expect(systemPrompt.role).toBe('system');

    const expectedFragments = [
      '## 進捗管理',
      'CreateChecklist',
      '全 piece で常時利用可能',
      // Stronger language (option A): an explicit "3 or more tool calls"
      // criterion, a "when in doubt, create one" rule, and an explicit note
      // about the second and later exchanges.
      '3 個以上のツール呼び出し',
      '判断に迷ったら作る',
      '2 回目以降のやり取り',
    ];
    for (const fragment of expectedFragments) {
      expect(systemPrompt.content).toContain(fragment);
    }
  });
});
// Checklist watchdog (option B): a user-role message tagged
// "[checklist watchdog]" is expected once after a run of iterations without
// any checklist tool, and never when CreateChecklist is called up front.
describe('checklist watchdog (option B)', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  // Scripted events are kept loosely typed; the `as never` cast at the
  // FakeClient constructor bridges to whatever event type it declares.
  type ScriptedTurn = Array<Record<string, unknown>>;

  /** Builds `count` turns, each a single Read of a distinct file followed by `done`. */
  const readTurns = (count: number): ScriptedTurn[] =>
    Array.from({ length: count }, (_unused, i) => [
      { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
      { type: 'done' },
    ]);

  /** The terminal turn shared by both tests. */
  const completeTurn = (): ScriptedTurn => [
    { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
    { type: 'done' },
  ];

  /**
   * Counts user-role string messages carrying the watchdog tag in the message
   * list of the client's final request (the fullest history the test can see).
   */
  const countWatchdogMessages = (client: FakeClient): number => {
    const allMessagesAtFinalCall = client.calls[client.calls.length - 1]!.messages as Array<{
      role: string;
      content?: unknown;
    }>;
    return allMessagesAtFinalCall.filter(
      (m) => m.role === 'user' && typeof m.content === 'string' && (m.content as string).includes('[checklist watchdog]'),
    ).length;
  };

  it('injects a one-shot reminder after 5 iterations without any checklist tool', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
    // 6 iterations of plain Read → 1 iteration with complete. The watchdog
    // should fire on iteration 5 (0-indexed).
    const responses: ScriptedTurn[] = [...readTurns(6), completeTurn()];
    const client = new FakeClient(responses as never);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
    expect(countWatchdogMessages(client)).toBe(1);
  });

  it('does NOT fire when CreateChecklist is called early', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'CreateChecklist']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
    // CreateChecklist on iteration 0, then several Read, then complete.
    const responses: ScriptedTurn[] = [
      [{ type: 'tool_use', id: 'cc', name: 'CreateChecklist', input: { name: 'plan', items: [] } }, { type: 'done' }],
      ...readTurns(7),
      completeTurn(),
    ];
    const client = new FakeClient(responses as never);
    await executeMovement(makeMovement(['Read', 'CreateChecklist']), 'task', client as never, makeContext());
    expect(countWatchdogMessages(client)).toBe(0);
  });
});
// ============================================================
// Phase 6a — `complete` tool (§7.1–7.7 of design plan)
// ============================================================
// §7.1: how each `complete` status maps onto the movement result
// (COMPLETE / ABORT / ASK), and that an invalid `complete` is retried rather
// than accepted.
describe('Phase 6a: complete tool — §7.1 native basics', () => {
  beforeEach(() => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  });
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  // Runs the default Read-only movement against a scripted client.
  const run = (llm: FakeClient) => executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());

  it('success status with result becomes the movement output (no chatter contamination)', async () => {
    const chatter = "I'll figure this out first";
    const llm = new FakeClient([
      [
        { type: 'text', text: chatter }, // chatter to be ignored
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'final answer' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await run(llm);
    expect(outcome.next).toBe('COMPLETE');
    expect(outcome.output).toBe('final answer');
    expect(outcome.output).not.toContain(chatter);
  });

  it('aborted status routes via ABORT next', async () => {
    const llm = new FakeClient([
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'aborted', abort_reason: 'tool unavailable' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await run(llm);
    expect(outcome.next).toBe('ABORT');
    expect(outcome.output).toBe('tool unavailable');
  });

  it('needs_user_input routes via ASK next', async () => {
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: { status: 'needs_user_input', missing_info: 'which dataset?', why_no_default: 'multiple options' },
        },
        { type: 'done' },
      ],
    ]);
    const outcome = await run(llm);
    expect(outcome.next).toBe('ASK');
    expect(outcome.output).toBe('which dataset?');
  });

  it('rejects success with empty result and forces retry (no accumulatedText fallback for native)', async () => {
    const strayText = 'this would have been the fallback';
    const llm = new FakeClient([
      // Turn 1: empty result, accompanied by free text.
      [
        { type: 'text', text: strayText },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // Turn 2: a well-formed completion.
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'real answer' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await run(llm);
    expect(outcome.next).toBe('COMPLETE');
    expect(outcome.output).toBe('real answer');
    // The fallback chatter must NOT have leaked into the output via retry.
    expect(outcome.output).not.toContain(strayText);
  });

  it('rejects aborted without abort_reason and forces retry', async () => {
    const llm = new FakeClient([
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'aborted' } }, { type: 'done' }],
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'aborted', abort_reason: 'real reason' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await run(llm);
    expect(outcome.next).toBe('ABORT');
    expect(outcome.output).toBe('real reason');
  });
});
// §7.2: precedence when a single turn carries an invalid `complete`, or more
// than one `complete` call.
describe('Phase 6a: complete tool — §7.2 precedence rules (§2.5)', () => {
  beforeEach(() => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  });
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  // Runs the default Read-only movement against a scripted client.
  const run = (llm: FakeClient) => executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());

  it('invalid native complete forces retry (no fallback)', async () => {
    const llm = new FakeClient([
      // First iteration: invalid (empty result) — must retry
      [
        { type: 'text', text: 'this would have been the fallback' },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // Second iteration: LLM corrects with valid native
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'native corrected' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await run(llm);
    expect(outcome.output).toBe('native corrected');
    // accumulatedText must NOT have leaked into the output via retry
    expect(outcome.output).not.toContain('fallback');
  });

  it('two native completes with conflicting args → retry', async () => {
    const llm = new FakeClient([
      // Two completions in one turn that disagree on `result`.
      [
        { type: 'tool_use', id: 'a', name: 'complete', input: { status: 'success', result: 'A' } },
        { type: 'tool_use', id: 'b', name: 'complete', input: { status: 'success', result: 'B' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'consistent' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await run(llm);
    expect(outcome.output).toBe('consistent');
  });

  it('two native completes with identical args → first one used', async () => {
    const llm = new FakeClient([
      [
        { type: 'tool_use', id: 'a', name: 'complete', input: { status: 'success', result: 'same answer' } },
        { type: 'tool_use', id: 'b', name: 'complete', input: { status: 'success', result: 'same answer' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await run(llm);
    expect(outcome.output).toBe('same answer');
  });
});
// §7.4: `transition` keeps working for movement-to-movement hops, while a
// legacy terminal `transition` to COMPLETE no longer ends the movement.
describe('Phase 6a: complete tool — §7.4 regressions', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('non-terminal transition (movement-to-movement) still works', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const investigate: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const llm = new FakeClient([
      [
        { type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 'investigated' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await executeMovement(investigate, 'task', llm as never, makeContext());
    expect(outcome.next).toBe('plan');
    expect(outcome.output).toBe('investigated');
  });

  it('transition({next_step:"COMPLETE"}) is rejected (Phase 6b)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    // FakeClient bypasses tool-schema validation, but the engine's
    // validateTransition still rejects unknown next values, so an LLM
    // emitting a legacy terminal transition gets a tool-result error and
    // must retry. This locks in the no-shim behavior.
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: { next_step: 'COMPLETE', summary: 'should not be accepted' },
        },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'corrected' } },
        { type: 'done' },
      ],
    ]);
    const investigate: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      // No COMPLETE in rules — legacy terminal cannot succeed.
      rules: [{ condition: 'fix needed', next: 'fix' }],
    };
    const outcome = await executeMovement(investigate, 'task', llm as never, makeContext());
    expect(outcome.output).toBe('corrected');
  });
});
// §7.5: a memory_update embedded in `complete` is committed to WorkspaceMemory
// only when that `complete` call is accepted, for success and aborted alike.
describe('Phase 6a: complete tool — §7.5 memory_update behavior', () => {
  beforeEach(() => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  });
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  // Runs the default Read-only movement with the given memory attached.
  const runWithMemory = (llm: FakeClient, workspaceMemory: WorkspaceMemory) =>
    executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), { workspaceMemory });

  it('memory_update inside complete is applied to WorkspaceMemory (success)', async () => {
    const store = new WorkspaceMemory();
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'success',
            result: 'done',
            memory_update: { facts: [{ claim: 'X is Y', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await runWithMemory(llm, store);
    expect(store.size().facts).toBe(1);
  });

  it('memory_update is NOT applied when complete is invalid and retried', async () => {
    const store = new WorkspaceMemory();
    const llm = new FakeClient([
      // Invalid first attempt — memory_update must NOT commit
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'success',
            result: '',
            memory_update: { facts: [{ claim: 'should not commit', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
      // Valid retry — different fact, must commit only this one
      [
        {
          type: 'tool_use',
          id: 'c2',
          name: 'complete',
          input: {
            status: 'success',
            result: 'fixed',
            memory_update: { facts: [{ claim: 'committed once', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await runWithMemory(llm, store);
    expect(store.size().facts).toBe(1);
    expect(store.snapshot().facts[0]!.claim).toBe('committed once');
  });

  it('memory_update applied for status=aborted', async () => {
    const store = new WorkspaceMemory();
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'aborted',
            abort_reason: 'tool unavailable',
            memory_update: { facts: [{ claim: 'lesson from failure', confidence: 'medium' }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await runWithMemory(llm, store);
    expect(store.size().facts).toBe(1);
  });
});
// §7.7: when a turn with several `complete` calls is rejected, the follow-up
// request must carry a tool-role message for every tool_use id of that turn.
describe('Phase 6a: complete tool — §7.7 v3 critical (Conditional Go) tests', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('all tool_use ids get a tool_result on retry (conversation history integrity)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      // Turn 1: an invalid completion plus a second completion in the same turn.
      [
        { type: 'tool_use', id: 'fail-1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'tool_use', id: 'ignored-1', name: 'complete', input: { status: 'success', result: 'X' } },
        { type: 'done' },
      ],
      // Turn 2: the recovery.
      [
        { type: 'tool_use', id: 'good', name: 'complete', input: { status: 'success', result: 'recovered' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());

    // Inspect the messages sent on the second call — every prior tool_use must have a tool_result.
    const retryRequest = llm.calls[1]?.messages as Array<{ role: string; tool_call_id?: string }>;
    const answeredIds = retryRequest
      .filter((message) => message.role === 'tool')
      .map((message) => message.tool_call_id);
    expect(answeredIds).toContain('fail-1');
    expect(answeredIds).toContain('ignored-1');
  });
});
// ============================================================
// Phase 6c — `memory_update` standalone tool
// ============================================================
/**
 * Phase 6c — the standalone `memory_update` tool.
 *
 * Every test scripts a FakeClient with one response per LLM iteration and then
 * inspects the WorkspaceMemory instance handed to `executeMovement`.
 */
describe('Phase 6c: memory_update tool', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('mid-movement memory_update commits before terminal selection (visible to LLM next iteration)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: emit memory_update only
      [
        {
          type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {
            facts: [{ claim: 'mid-movement observation' }],
          },
        },
        { type: 'done' },
      ],
      // iteration 1: terminal complete
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(result.next).toBe('COMPLETE');
    expect(memory.size().facts).toBe(1);
    expect(memory.snapshot().facts[0]!.claim).toBe('mid-movement observation');
  });

  it('returns "no changes committed" tool_result for empty payload', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {} },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });
    // The next iteration's messages include the tool_result for mu-1.
    const secondCallMessages = client.calls[1]?.messages as Array<{ role: string; content?: unknown; tool_call_id?: string }>;
    const muResult = secondCallMessages.find((m) => m.role === 'tool' && m.tool_call_id === 'mu-1');
    expect(muResult).toBeDefined();
    expect(String(muResult?.content)).toMatch(/no changes committed/);
    expect(memory.size().facts).toBe(0);
  });

  it('exact-claim duplicate within same iteration is merged (Codex Phase 6c §2.5)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        {
          type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {
            facts: [{ claim: 'X is Y', evidence_paths: ['a.ts'] }],
          },
        },
        {
          type: 'tool_use', id: 'mu-2', name: 'memory_update', input: {
            facts: [{ claim: 'X is Y', evidence_paths: ['b.ts'] }],
          },
        },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    const snap = memory.snapshot();
    expect(snap.facts).toHaveLength(1);
    // evidence_paths should be union-merged, not duplicated.
    // Sort a copy: Array.prototype.sort works in place and would otherwise
    // reorder the array held by the snapshot being inspected.
    expect([...snap.facts[0]!.evidencePaths].sort()).toEqual(['a.ts', 'b.ts']);
  });

  it('memory_update commits even when complete in the same iteration is invalid (retry path)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: memory_update + invalid complete (empty result) → retry
      [
        {
          type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {
            facts: [{ claim: 'observation persists' }],
          },
        },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // iteration 1: corrected complete
      [{ type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(result.next).toBe('COMPLETE');
    // The memory_update from the FIRST iteration should still be committed,
    // even though the iteration retried due to invalid complete args.
    expect(memory.snapshot().facts.map((f) => f.claim)).toContain('observation persists');
  });

  it('memory_update is a META_TOOL — works even when allowed_tools is empty', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs([]));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        {
          type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {
            facts: [{ claim: 'still works' }],
          },
        },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    const movement: Movement = {
      name: 'execute', edit: false, persona: 'p', instruction: 'i',
      allowedTools: [], // empty — yet memory_update is in the tool catalog
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    await executeMovement(movement, 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(memory.size().facts).toBe(1);
  });
});
// ============================================================
// Traceability T-1 — events.jsonl emission scenarios
// ============================================================
import { mkdtempSync, rmSync, readFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { createFileEventLogger, parseEventLine, type EventBase } from '../progress/event-log.js';
/**
 * Reads `<workspace>/logs/events.jsonl` and decodes every non-empty line with
 * `parseEventLine`.
 *
 * @param workspace - Directory that contains the `logs/events.jsonl` file.
 * @returns The decoded events, in file order.
 * @throws Error when a line does not decode with `kind === 'ok'`; file-system
 *   errors from `readFileSync` propagate unchanged.
 */
function readEvents(workspace: string): EventBase[] {
  const logFile = join(workspace, 'logs', 'events.jsonl');
  const rawText = readFileSync(logFile, 'utf-8');
  const decoded: EventBase[] = [];
  for (const line of rawText.trim().split('\n')) {
    // Blank lines carry no event.
    if (line.length === 0) continue;
    const outcome = parseEventLine(line);
    if (outcome.kind !== 'ok') {
      throw new Error(`unexpected non-ok event line: ${line}`);
    }
    decoded.push(outcome.event);
  }
  return decoded;
}
/**
 * Traceability T-1 — checks which event kinds `executeMovement` emits.
 *
 * Each test runs against a fresh temporary workspace; the ToolContext carries a
 * file event logger created for that workspace, and the emitted events are read
 * back with `readEvents`.
 */
describe('Traceability T-1: agent-loop emission', () => {
  let workspace: string;

  beforeEach(() => {
    workspace = mkdtempSync(join(tmpdir(), 'trace-emission-'));
  });

  afterEach(() => {
    rmSync(workspace, { recursive: true, force: true });
    // Reset the shared module mocks like the other suites in this file do, so a
    // stubbed executeTool/getToolDefs implementation cannot leak into later tests.
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  /** Builds a ToolContext whose event logger targets `workspaceLocal`, with runId 'test-run'. */
  function makeContextWithEvents(workspaceLocal: string): ToolContext {
    return {
      workspacePath: workspaceLocal,
      editAllowed: true,
      eventLogger: createFileEventLogger({ workspacePath: workspaceLocal, runId: 'test-run' }),
    };
  }

  it('emits movement_start, tool_call/result pair, and movement_complete for a single Read flow', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });
    const client = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContextWithEvents(workspace));
    const events = readEvents(workspace);
    const kinds = events.map((e) => e.kind);
    expect(kinds).toContain('movement_start');
    expect(kinds).toContain('tool_call');
    expect(kinds).toContain('tool_result');
    expect(kinds).toContain('complete');
    expect(kinds).toContain('movement_complete');
    // The call and its result must be linked by the same correlationId.
    const call = events.find((e) => e.kind === 'tool_call' && e.llmToolCallId === 'r1');
    const result = events.find((e) => e.kind === 'tool_result' && e.llmToolCallId === 'r1');
    expect(call?.correlationId).toBeTruthy();
    expect(result?.correlationId).toBe(call?.correlationId);
    expect((result?.payload as { cacheHit: boolean }).cacheHit).toBe(false);
  });

  it('emits cache_set on first Read and cache_hit on second Read of the same file', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });
    // One cache and one context are shared by both movements.
    const cache = new ToolResultCache();
    const ctx = makeContextWithEvents(workspace);
    const movA: Movement = {
      name: 'investigate', edit: false, persona: 'p', instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movA, 'task', clientA as never, ctx, { toolResultCache: cache });
    const movB: Movement = {
      name: 'plan', edit: false, persona: 'p', instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, ctx, { toolResultCache: cache });
    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'cache_set')).toBe(true);
    expect(events.some((e) => e.kind === 'cache_hit')).toBe(true);
    // The hit must name the movement that populated the cache entry.
    const cacheHit = events.find((e) => e.kind === 'cache_hit');
    expect((cacheHit?.payload as { sourceMovement: string }).sourceMovement).toBe('investigate');
  });

  it('emits cache_invalidate and memory_invalidate after a successful Edit', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'Read') return { output: 'body', isError: false };
      if (name === 'Edit') return { output: 'edited', isError: false };
      return { output: 'x', isError: true };
    });
    const cache = new ToolResultCache();
    const memory = new WorkspaceMemory();
    const ctx = makeContextWithEvents(workspace);
    // Movement A reads foo.ts and records a fact whose evidence is foo.ts.
    const movA: Movement = {
      name: 'investigate', edit: true, persona: 'p', instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use', id: 't1', name: 'transition', input: {
            next_step: 'execute',
            summary: 's',
            memory_update: { facts: [{ claim: 'foo.ts uses bar', evidence_paths: ['foo.ts'] }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(movA, 'task', clientA as never, ctx, { toolResultCache: cache, workspaceMemory: memory });
    // Movement B edits the same file.
    const movB: Movement = {
      name: 'execute', edit: true, persona: 'p', instruction: 'i',
      allowedTools: ['Edit'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, ctx, { toolResultCache: cache, workspaceMemory: memory });
    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'cache_invalidate')).toBe(true);
    expect(events.some((e) => e.kind === 'memory_invalidate')).toBe(true);
  });

  it('emits memory_update_call with counts when the LLM calls memory_update', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const ctx = makeContextWithEvents(workspace);
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: { facts: [{ claim: 'x' }, { claim: 'y' }] } },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, ctx, { workspaceMemory: memory });
    const events = readEvents(workspace);
    const muCall = events.find((e) => e.kind === 'memory_update_call');
    expect(muCall).toBeDefined();
    const payload = muCall?.payload as { counts: { factsAdded: number } };
    expect(payload.counts.factsAdded).toBe(2);
  });

  it('emits watchdog_fire when no checklist tool is used in 5 iterations', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
    // Six Read-only iterations followed by a terminal `complete`. The script's
    // element type is left to inference; it is passed to FakeClient via `as never`.
    const readOnlyTurns = Array.from({ length: 6 }, (_, i) => [
      { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
      { type: 'done' },
    ]);
    const responses = [
      ...readOnlyTurns,
      [
        { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ];
    const client = new FakeClient(responses as never);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContextWithEvents(workspace));
    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'watchdog_fire')).toBe(true);
  });

  it('all events share the same runId across one piece run', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const ctx = makeContextWithEvents(workspace);
    const client = new FakeClient([
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, ctx);
    const events = readEvents(workspace);
    expect(events.length).toBeGreaterThan(0);
    const runIds = new Set(events.map((e) => e.runId));
    expect(runIds.size).toBe(1);
    expect(runIds.has('test-run')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Phase 4 (SSH Console): buildSystemPrompt screen injection
// ---------------------------------------------------------------------------
import { buildSystemPrompt, __setActiveSessionLookup, type HandoffContext } from './agent-loop.js';
/**
 * Phase 4 (SSH Console) — when `buildSystemPrompt` adds the "Console screen"
 * block, driven by the movement's allowed tools, the task id argument, and the
 * session lookup installed through `__setActiveSessionLookup`.
 */
describe('buildSystemPrompt console injection', () => {
  afterEach(() => {
    // Remove whatever lookup the test installed.
    __setActiveSessionLookup(null);
  });

  /** Minimal movement whose only varying field is the allowed-tool list. */
  const consoleMovement = (allowedTools: string[]): Movement => ({
    name: 'm1',
    edit: false,
    persona: 'p',
    instruction: 'i',
    allowedTools,
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  });

  /** Fake 80x24 session whose screen snapshot yields `text`. */
  const sessionShowing = (text: string) => ({
    cols: 80,
    rows: 24,
    snapshotScreen: () => ({ text }),
  });

  /** Calls buildSystemPrompt with the fixed arguments shared by all tests here. */
  const promptFor = (allowedTools: string[], taskId: string | null) =>
    buildSystemPrompt(
      consoleMovement(allowedTools),
      1,
      5,
      [],
      undefined,
      null,
      undefined,
      undefined,
      undefined,
      taskId,
    );

  it('appends screen block when SshConsole* in allowed_tools and session exists', () => {
    const session = sessionShowing('mock-screen-line-1\nmock-screen-line-2');
    __setActiveSessionLookup((_tid: string) => session);

    const sys = promptFor(['SshConsoleEnsure', 'SshConsoleSend', 'SshConsoleSnapshot'], 't1');

    expect(sys).toContain('Console screen');
    expect(sys).toContain('mock-screen-line-1');
    expect(sys).toContain('mock-screen-line-2');
  });

  it('does NOT inject when piece does not allow console tools', () => {
    const session = sessionShowing('mock-screen-X');
    __setActiveSessionLookup((_tid: string) => session);

    const sys = promptFor(['Read', 'Bash'], 't1');

    expect(sys).not.toContain('Console screen');
    expect(sys).not.toContain('mock-screen-X');
  });

  it('does NOT inject when no active session is registered for the task', () => {
    __setActiveSessionLookup((_tid: string) => null);

    const sys = promptFor(['SshConsoleSend'], 't1');

    expect(sys).not.toContain('Console screen');
  });

  it('does NOT inject when taskId is missing (subtask without local_task binding)', () => {
    const session = sessionShowing('mock-screen');
    // Records the id the lookup was asked for; it must stay null.
    let lookedUp: string | null = null;
    __setActiveSessionLookup((tid: string) => {
      lookedUp = tid;
      return session;
    });

    const sys = promptFor(['SshConsoleSnapshot'], null);

    expect(sys).not.toContain('Console screen');
    expect(lookedUp).toBeNull();
  });

  it('truncates injected screen to the configured tail length', () => {
    // 200 numbered lines: line-1 … line-200.
    const screenLines: string[] = [];
    for (let n = 1; n <= 200; n++) {
      screenLines.push(`line-${n}`);
    }
    const session = sessionShowing(screenLines.join('\n'));
    __setActiveSessionLookup((_tid: string) => session);

    const sys = promptFor(['SshConsoleSend'], 't1');

    // Default tail = 24 lines: should contain the last line but not very early lines.
    expect(sys).toContain('line-200');
    expect(sys).toContain('line-177'); // 200 - 24 + 1 = 177 (inclusive tail)
    expect(sys).not.toContain('line-1\n');
    expect(sys).not.toContain('line-50');
  });
});
/**
 * Handoff rendering in `buildSystemPrompt`: the static Continue block and the
 * dynamic block produced when a HandoffContext is supplied.
 */
describe('buildSystemPrompt — handoff blocks', () => {
  // buildSystemPrompt can be called directly; this is a minimal Movement that
  // keeps the rendering pipeline from erroring.
  const movement: Movement = {
    name: 'investigate',
    persona: 'investigator',
    instruction: 'do the thing',
    rules: [{ condition: 'done', next: 'plan' }],
    allowedTools: [],
    edit: false,
  };

  /** Renders the prompt for `movement` with `handoff` as the trailing argument. */
  const renderWithHandoff = (handoff: HandoffContext) =>
    buildSystemPrompt(movement, 1, 5, [], undefined, null, undefined, undefined, undefined, null, handoff);

  it('always includes the static Continue block', () => {
    const rendered = buildSystemPrompt(movement);
    expect(rendered).toContain('Continue 機能');
    expect(rendered).toContain('output/ ファイルは次の piece でもそのまま参照されます');
  });

  it('omits the dynamic block when handoffContext is not provided', () => {
    const rendered = buildSystemPrompt(movement);
    expect(rendered).not.toContain('前 piece からの引き継ぎ');
  });

  it('includes the dynamic block with prev piece + result when handoffContext is provided', () => {
    const rendered = renderWithHandoff({
      prevPiece: 'manual-writer',
      prevResult: 'output/manual.md is ready, with 5 sections covering setup',
    });
    expect(rendered).toContain('前 piece からの引き継ぎ');
    expect(rendered).toContain('manual-writer');
    expect(rendered).toContain('output/manual.md is ready');
  });

  it('handles null prevResult gracefully', () => {
    const rendered = renderWithHandoff({ prevPiece: 'manual-writer', prevResult: null });
    expect(rendered).toContain('前 piece からの引き継ぎ');
    expect(rendered).toContain('前 piece は最終出力を残しませんでした');
  });

  it('truncates very long prevResult to ~2500 chars (head + tail)', () => {
    // Input layout: 3000 'A's, then the unique marker 'BBBBBMIDDLE', then
    // 3000 'C's. The expectation is that the head (a run of 'A') and the tail
    // (a run of 'C') survive around a truncation marker while the middle
    // marker is dropped.
    const head = 'A'.repeat(3000);
    const tail = 'C'.repeat(3000);
    const oversized = head + 'BBBBBMIDDLE' + tail;
    const rendered = renderWithHandoff({ prevPiece: 'manual-writer', prevResult: oversized });
    expect(rendered).toContain('[truncated]');
    expect(rendered).toContain('A'.repeat(100)); // head present
    expect(rendered).toContain('C'.repeat(100)); // tail present
    expect(rendered).not.toContain('BBBBBMIDDLE'); // middle was cut
  });
});