maestro/src/engine/agent-loop.test.ts
oss-sync c526adddc2
Some checks failed
CI / build-and-test (push) Has been cancelled
sync: update from private repo (402599f)
2026-06-04 13:41:33 +00:00

2369 lines
93 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import type { LLMEvent, ToolDef } from '../llm/openai-compat.js';
import type { ToolContext } from './tools/index.js';
import { stripThinkingTokens } from './agent-loop.js';
// Unit tests for stripThinkingTokens. Each case passes a string containing one
// reasoning-marker syntax and asserts the exact string that is returned.
describe('stripThinkingTokens', () => {
  it('strips <think>...</think> blocks', () => {
    expect(stripThinkingTokens('<think>reasoning here</think>actual output')).toBe('actual output');
  });

  // The closing tag exercised here is `<|/thinking|>`; the title names that exact form.
  it('strips <|thinking|>...<|/thinking|> blocks', () => {
    expect(stripThinkingTokens('<|thinking|>deep thought<|/thinking|>result')).toBe('result');
  });

  // Covers both a newline and a space between `thought` and the `<channel|>` marker.
  it('strips gemma4 thought<channel|> pattern', () => {
    expect(stripThinkingTokens('thought\n<channel|>')).toBe('');
    expect(stripThinkingTokens('thought <channel|>hello')).toBe('hello');
  });

  it('strips <channel|>...<channel|> blocks', () => {
    expect(stripThinkingTokens('<channel|>internal reasoning<channel|>visible output')).toBe('visible output');
  });

  // Input without any marker must come back untouched.
  it('returns normal text unchanged', () => {
    expect(stripThinkingTokens('hello world')).toBe('hello world');
  });

  it('handles multiline thinking blocks', () => {
    expect(stripThinkingTokens('<think>\nline1\nline2\n</think>output')).toBe('output');
  });
});
// Mock functions are created through vi.hoisted so they already exist when the
// vi.mock factory below runs (Vitest hoists vi.mock calls above the imports).
const { executeToolMock, getToolDefsMock } = vi.hoisted(() => ({
executeToolMock: vi.fn(),
getToolDefsMock: vi.fn(),
}));
// Replace the exports of ./tools/index.js with the two mocks above, so tests
// control tool definitions and tool execution results.
vi.mock('./tools/index.js', () => ({
executeTool: executeToolMock,
getToolDefs: getToolDefsMock,
}));
import { executeMovement, type Movement } from './agent-loop.js';
import { ToolResultCache } from './context/tool-result-cache.js';
import { WorkspaceMemory } from './context/workspace-memory.js';
/**
 * Builds the default movement fixture used by most tests: an editable
 * `execute` movement for the `worker` persona whose single rule and default
 * both lead to `COMPLETE`.
 *
 * @param allowedTools Tool names exposed to the movement; the array is stored as-is (not copied).
 * @returns A new Movement object on every call.
 */
function makeMovement(allowedTools: string[]): Movement {
  const fixture: Movement = {
    name: 'execute',
    edit: true,
    persona: 'worker',
    instruction: 'Do the work.',
    allowedTools,
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  return fixture;
}
/**
 * Creates one minimal function-type tool definition per name. The name doubles
 * as the description, and the parameter schema is an empty object schema.
 *
 * @param names Tool names, in the order the definitions should appear.
 * @returns A new array of tool definitions, one per entry in `names`.
 */
function makeToolDefs(names: string[]): ToolDef[] {
  const defs: ToolDef[] = [];
  for (const toolName of names) {
    defs.push({
      type: 'function',
      function: {
        name: toolName,
        description: toolName,
        // A fresh schema object is built for every definition.
        parameters: { type: 'object', properties: {}, required: [] },
      },
    });
  }
  return defs;
}
/**
 * Returns a fresh tool context pointing at the fixed test workspace path with
 * editing allowed.
 */
function makeContext(): ToolContext {
  const context: ToolContext = {
    workspacePath: '/tmp/agent-loop-test',
    editAllowed: true,
  };
  return context;
}
/**
 * Creates a promise together with the function that resolves it, so a test can
 * decide exactly when an awaited operation completes.
 *
 * @returns The pending promise and its `resolve` function. No reject handle is exposed.
 */
function deferred<T>(): { promise: Promise<T>; resolve: (value: T) => void } {
  // Assigned synchronously by the Promise executor below, hence the definite-assignment assertion.
  let settle!: (value: T) => void;
  const pending = new Promise<T>((fulfil) => {
    settle = fulfil;
  });
  return { promise: pending, resolve: settle };
}
/**
 * Polls `predicate` until it returns true, yielding to the timer queue between
 * checks.
 *
 * @param predicate Condition to poll; checked immediately and after every zero-delay timer.
 * @param timeoutMs Maximum elapsed time in milliseconds before giving up (default 250).
 * @throws Error with message 'Timed out waiting for condition' when the
 *   predicate is still false after more than `timeoutMs` milliseconds.
 */
async function waitFor(predicate: () => boolean, timeoutMs: number = 250): Promise<void> {
  const startedAt = Date.now();
  for (;;) {
    if (predicate()) {
      return;
    }
    const elapsed = Date.now() - startedAt;
    if (elapsed > timeoutMs) {
      throw new Error('Timed out waiting for condition');
    }
    // Zero-delay timer: lets pending timers and microtasks run before re-checking.
    await new Promise((wake) => setTimeout(wake, 0));
  }
}
/**
 * Scripted stand-in for the LLM client. Each `chat` invocation records its
 * arguments and replays the next pre-recorded list of events; once the script
 * is exhausted, further invocations yield nothing.
 */
class FakeClient {
  /** Arguments of every `chat` invocation, in call order. */
  readonly calls: Array<{ messages: unknown; tools?: unknown }> = [];
  /** Position of the next scripted response to replay. */
  private index = 0;

  constructor(private readonly responses: LLMEvent[][]) {}

  /**
   * Records the call and yields the events of the next scripted response.
   * The abort signal parameter is accepted but not used.
   */
  async *chat(messages: unknown, tools?: unknown, _signal?: AbortSignal): AsyncGenerator<LLMEvent> {
    this.calls.push({ messages, tools });
    const scripted = this.responses[this.index];
    this.index += 1;
    if (scripted == null) {
      return;
    }
    yield* scripted;
  }
}
describe('executeMovement parallel tool execution', () => {
// Reset both tool mocks after every test so each case configures its own
// return values and starts with an empty call history.
afterEach(() => {
executeToolMock.mockReset();
getToolDefsMock.mockReset();
});
// Read and Glob both block on deferred gates. The test waits until both have
// started before releasing either gate, so it only passes if the two calls
// were started without waiting for the first one to finish.
it('runs consecutive safe tool calls in parallel', async () => {
const readGate = deferred<{ output: string; isError: boolean }>();
const globGate = deferred<{ output: string; isError: boolean }>();
const started: string[] = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Glob']));
executeToolMock.mockImplementation(async (name: string) => {
started.push(name);
if (name === 'Read') return readGate.promise;
if (name === 'Glob') return globGate.promise;
return { output: 'ok', isError: false };
});
// First response: two tool calls in one turn. Second response: the completion transition.
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
{ type: 'tool_use', id: 'glob-1', name: 'Glob', input: { pattern: 'input/*' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
const runPromise = executeMovement(makeMovement(['Read', 'Glob']), 'task', client as never, makeContext());
await waitFor(() => started.length === 2);
expect(started).toEqual(['Read', 'Glob']);
readGate.resolve({ output: 'read ok', isError: false });
globGate.resolve({ output: 'glob ok', isError: false });
const result = await runPromise;
expect(result.next).toBe('COMPLETE');
expect(executeToolMock).toHaveBeenCalledTimes(2);
});
// Read and Write are requested in the same response, each blocked on its own
// gate. Only Read may have started while its gate is closed; Write is expected
// to start only after Read has resolved.
it('keeps side-effecting tools sequential as a barrier', async () => {
const readGate = deferred<{ output: string; isError: boolean }>();
const writeGate = deferred<{ output: string; isError: boolean }>();
const started: string[] = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Write']));
executeToolMock.mockImplementation(async (name: string) => {
started.push(name);
if (name === 'Read') return readGate.promise;
if (name === 'Write') return writeGate.promise;
return { output: 'ok', isError: false };
});
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
{ type: 'tool_use', id: 'write-1', name: 'Write', input: { file_path: 'output/a.txt', content: 'x' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
const runPromise = executeMovement(makeMovement(['Read', 'Write']), 'task', client as never, makeContext());
// While the Read gate is closed, Write must not have been started.
await waitFor(() => started.length === 1);
expect(started).toEqual(['Read']);
readGate.resolve({ output: 'read ok', isError: false });
await waitFor(() => started.length === 2);
expect(started).toEqual(['Read', 'Write']);
writeGate.resolve({ output: 'write ok', isError: false });
const result = await runPromise;
expect(result.next).toBe('COMPLETE');
expect(executeToolMock).toHaveBeenCalledTimes(2);
});
// The single scripted response places the `complete` transition between the
// Read and Glob calls. Both regular tools are still expected to start, and the
// movement is expected to finish with COMPLETE without a second LLM response.
it('executes regular tools before transition even if transition appears mid-batch', async () => {
const readGate = deferred<{ output: string; isError: boolean }>();
const globGate = deferred<{ output: string; isError: boolean }>();
const started: string[] = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Glob']));
executeToolMock.mockImplementation(async (name: string) => {
started.push(name);
if (name === 'Read') return readGate.promise;
if (name === 'Glob') return globGate.promise;
return { output: 'glob ok', isError: false };
});
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'tool_use', id: 'glob-1', name: 'Glob', input: { pattern: 'input/*' } },
{ type: 'done' },
],
]);
const runPromise = executeMovement(makeMovement(['Read', 'Glob']), 'task', client as never, makeContext());
await waitFor(() => started.length === 2);
expect(started).toEqual(['Read', 'Glob']);
readGate.resolve({ output: 'read ok', isError: false });
globGate.resolve({ output: 'glob ok', isError: false });
const result = await runPromise;
expect(result.next).toBe('COMPLETE');
// Only Read and Glob reach executeTool (two calls in total).
expect(executeToolMock).toHaveBeenCalledTimes(2);
});
// Records what the onText, onTextPreview and onToolUse callbacks receive for a
// response made of two text chunks followed by one Read call.
it('emits text, preview, and tool callbacks in order', async () => {
const textChunks: string[] = [];
const previews: Array<{ movement: string; preview: string }> = [];
const toolUses: Array<{ name: string; input: Record<string, unknown> }> = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });
const client = new FakeClient([
[
{ type: 'text', text: 'first ' },
{ type: 'text', text: 'second' },
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
const result = await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
{
callbacks: {
onText: (text) => textChunks.push(text),
onTextPreview: (movementName, preview) => previews.push({ movement: movementName, preview }),
onToolUse: (toolName, input) => toolUses.push({ name: toolName, input }),
},
},
);
expect(result.next).toBe('COMPLETE');
// Text arrives chunk by chunk; the preview is expected once, with the chunks concatenated.
expect(textChunks).toEqual(['first ', 'second']);
expect(previews).toEqual([{ movement: 'execute', preview: 'first second' }]);
// toContainEqual: other onToolUse entries may be present; only the Read entry is required.
expect(toolUses).toContainEqual({ name: 'Read', input: { file_path: 'input/a.txt' } });
});
// After a successful Read, the onToolResult callback is expected to have been
// invoked with the tool name and an info object whose isError is false.
it('fires onToolResult callback after tool execution', async () => {
const toolResults: Array<{ name: string; isError: boolean }> = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
{
callbacks: {
onToolResult: (name, info) => toolResults.push({ name, isError: info.isError }),
},
},
);
expect(toolResults).toContainEqual({ name: 'Read', isError: false });
});
// One Read call followed by a completion transition: onMemoryCheckpoint is
// expected to fire exactly once, with the value 1.
it('fires onMemoryCheckpoint callback after regular tool use', async () => {
const checkpoints: number[] = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
{
callbacks: {
onMemoryCheckpoint: (count) => checkpoints.push(count),
},
},
);
expect(checkpoints).toEqual([1]);
});
// Uses a real ContextManager with a single force_transition threshold and a
// scripted usage report of 80 prompt tokens. Only one LLM response is scripted:
// the movement is expected to end with COMPLETE and a 'Context limit reached'
// output without the model ever calling a transition tool.
// NOTE(review): presumably ratio 0.005 of limitTokens 10_000 means a 50-token
// threshold, which 80 prompt tokens exceed — confirm against ContextManager.
it('triggers force_transition when context manager signals exhaustion', async () => {
const { ContextManager } = await import('./context-manager.js');
const cm = new ContextManager({
thresholds: [{ ratio: 0.005, action: 'force_transition' }],
limitTokens: 10_000,
});
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
{ type: 'done', usage: { prompt_tokens: 80, completion_tokens: 10 } },
],
]);
const result = await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
{ contextManager: cm },
);
expect(result.next).toBe('COMPLETE');
expect(result.output).toContain('Context limit reached');
});
// Same setup as a threshold crossing, but with a 'warn' action: the movement
// continues to the scripted completion and the onContextAction callback is
// expected to have received an action of type 'warn'.
it('fires onContextAction callback when context threshold crossed', async () => {
const { ContextManager } = await import('./context-manager.js');
const cm = new ContextManager({
thresholds: [{ ratio: 0.005, action: 'warn' }],
limitTokens: 10_000,
});
const contextActions: Array<{ type: string }> = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
{ type: 'done', usage: { prompt_tokens: 60, completion_tokens: 10 } },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
{
callbacks: {
onContextAction: (action) => contextActions.push({ type: action.type }),
},
contextManager: cm,
},
);
expect(contextActions).toContainEqual({ type: 'warn' });
});
// Collects every onContextUpdate payload. The first scripted response reports
// 1234 prompt tokens; the second reports no usage.
it('fires onContextUpdate callback after each LLM usage update', async () => {
const { ContextManager } = await import('./context-manager.js');
const cm = new ContextManager({ limitTokens: 100_000 });
const updates: Array<{ promptTokens: number; limitTokens: number }> = [];
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
{ type: 'done', usage: { prompt_tokens: 1234, completion_tokens: 10 } },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
{
callbacks: {
onContextUpdate: (payload) => updates.push(payload),
},
contextManager: cm,
},
);
expect(updates.length).toBeGreaterThanOrEqual(1);
// Both a zero-token payload and the 1234-token payload are expected among the updates.
expect(updates).toContainEqual({ promptTokens: 0, limitTokens: 100_000 });
expect(updates).toContainEqual({ promptTokens: 1234, limitTokens: 100_000 });
});
// A Read returns an HTML string embedding 80_000 characters of base64-like
// data while the context limit is 20_000 tokens. The messages sent on the
// second LLM call are inspected: the tool message must carry an omission
// placeholder instead of the raw output, and a user message must tell the
// model how to retry.
it('compacts oversized tool results before the next LLM request', async () => {
const { ContextManager } = await import('./context-manager.js');
const cm = new ContextManager({ limitTokens: 20_000 });
const hugeBase64 = `<html><img src="data:text/html;base64,${'A'.repeat(80_000)}"></html>`;
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
executeToolMock.mockResolvedValue({ output: hugeBase64, isError: false });
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/page.html' } },
{ type: 'done', usage: { prompt_tokens: 1000, completion_tokens: 10 } },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done', usage: { prompt_tokens: 2000, completion_tokens: 10 } },
],
]);
const result = await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
{ contextManager: cm },
);
expect(result.next).toBe('COMPLETE');
expect(client.calls).toHaveLength(2);
const secondMessages = client.calls[1]!.messages as Array<{ role: string; content?: string }>;
const toolMessage = secondMessages.find((message) => message.role === 'tool');
expect(toolMessage?.content).toContain('[Tool result omitted before LLM request]');
expect(toolMessage?.content).not.toContain(hugeBase64);
// The agent should also receive a user-facing instruction to retry with a
// narrower scope, instead of silently dropping content.
const feedbackMessage = secondMessages.find(
(message) => message.role === 'user' && message.content?.includes('LLM コンテキストに入れられませんでした'),
);
expect(feedbackMessage).toBeDefined();
expect(feedbackMessage?.content).toMatch(/Read\(offset\/limit\)|Grep/);
});
// The same file is read twice with a 72_000-character result each time. The
// messages of the third LLM call are inspected: exactly one tool message must
// be a duplicate-read placeholder and exactly one must still hold the full
// content.
it('dedupes duplicate file reads before compacting tool results', async () => {
const { ContextManager } = await import('./context-manager.js');
const cm = new ContextManager({ limitTokens: 40_000 });
// 80% guard => 32_000 tokens. Each large ASCII read is roughly 20k tokens.
// Two duplicate reads exceed the guard without dedup; after dedup, only
// the latest read survives and the request fits.
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
const largeContent = 'X'.repeat(72_000);
executeToolMock.mockResolvedValue({ output: largeContent, isError: false });
const client = new FakeClient([
[
{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: '/dup.ts' } },
{ type: 'done', usage: { prompt_tokens: 500, completion_tokens: 10 } },
],
[
{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: '/dup.ts' } },
{ type: 'done', usage: { prompt_tokens: 18_000, completion_tokens: 10 } },
],
[
{ type: 'tool_use', id: 't', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
const result = await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
{ contextManager: cm },
);
expect(result.next).toBe('COMPLETE');
expect(client.calls.length).toBe(3);
// The 3rd LLM call should see the older Read replaced with a placeholder
const thirdCallMessages = client.calls[2]!.messages as Array<{ role: string; content?: string }>;
const placeholderMessages = thirdCallMessages.filter(
(m) => m.role === 'tool' && typeof m.content === 'string' && m.content.includes('Duplicate Read of /dup.ts'),
);
expect(placeholderMessages.length).toBe(1);
// The latest Read should still carry the full content
const fullReadMessages = thirdCallMessages.filter(
(m) => m.role === 'tool' && typeof m.content === 'string' && m.content === largeContent,
);
expect(fullReadMessages.length).toBe(1);
});
// Five Bash calls each return 18_000 characters under a 35_000-token limit.
// The scripted responses include one plain-text summary reply, and the test
// checks that the messages of the last LLM call contain at least one user
// message starting with SUMMARY_MARKER_PREFIX.
it('summarizes old turns when dedup+compact alone cannot fit prompt', async () => {
const { ContextManager } = await import('./context-manager.js');
const { SUMMARY_MARKER_PREFIX } = await import('./context/history-compactor.js');
// Tight context, distinct Bash outputs (so dedup cannot help) each below
// LARGE_TOOL_RESULT_TOKENS (so compaction cannot help either) — only
// history summarization can keep the conversation going.
// Phase 6c expanded the system prompt by ~200 chars (memory_update
// guidance + new tool definition); we bump the test limit a bit so
// summarization still fires on the 5th Bash turn rather than the 4th,
// matching the FakeClient slot layout below.
const cm = new ContextManager({ limitTokens: 35_000 });
getToolDefsMock.mockResolvedValue(makeToolDefs(['Bash']));
executeToolMock.mockResolvedValue({ output: 'B'.repeat(18_000), isError: false });
// FakeClient.chat() walks a single response list, so the isolated
// summarization LLM call (runIsolatedLlm) consumes the next slot. Slot 5
// is the summary response; slot 6 is the post-summary transition.
const client = new FakeClient([
[{ type: 'tool_use', id: 'b1', name: 'Bash', input: { command: 'echo 1' } }, { type: 'done', usage: { prompt_tokens: 1_000, completion_tokens: 5 } }],
[{ type: 'tool_use', id: 'b2', name: 'Bash', input: { command: 'echo 2' } }, { type: 'done', usage: { prompt_tokens: 5_500, completion_tokens: 5 } }],
[{ type: 'tool_use', id: 'b3', name: 'Bash', input: { command: 'echo 3' } }, { type: 'done', usage: { prompt_tokens: 10_000, completion_tokens: 5 } }],
[{ type: 'tool_use', id: 'b4', name: 'Bash', input: { command: 'echo 4' } }, { type: 'done', usage: { prompt_tokens: 15_000, completion_tokens: 5 } }],
[{ type: 'tool_use', id: 'b5', name: 'Bash', input: { command: 'echo 5' } }, { type: 'done', usage: { prompt_tokens: 20_000, completion_tokens: 5 } }],
[{ type: 'text', text: '## ゴール\nrun bash commands\n## 進捗\nDone: 5 commands' }, { type: 'done' }],
[
{ type: 'tool_use', id: 't', name: 'complete', input: { status: 'success', result: 'done' } },
{ type: 'done' },
],
]);
const result = await executeMovement(
makeMovement(['Bash']),
'task',
client as never,
makeContext(),
{ contextManager: cm },
);
expect(result.next).toBe('COMPLETE');
const finalMessages = client.calls[client.calls.length - 1]!.messages as Array<{ role: string; content?: string }>;
const markerMessages = finalMessages.filter(
(m) => m.role === 'user' && typeof m.content === 'string' && m.content.startsWith(SUMMARY_MARKER_PREFIX),
);
// Phase 6c expanded the system prompt; the exact iteration where
// summarization fires shifted slightly. The invariant we care about
// is "summarization fired at least once before completion".
expect(markerMessages.length).toBeGreaterThanOrEqual(1);
});
// A 10_000-character task is sent against a 1_000-token limit with the default
// fixture movement (defaultNext 'COMPLETE'). The result is expected to be ABORT
// with abortCode 'context_overflow'.
it('aborts when initial prompt is oversized and defaultNext is terminal', async () => {
const { ContextManager } = await import('./context-manager.js');
const cm = new ContextManager({ limitTokens: 1_000 });
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
// The guard will fail before the main LLM is called, but the
// last-resort summary path uses runIsolatedLlm (also via FakeClient),
// so we provide one response for that summary call.
const client = new FakeClient([
[{ type: 'text', text: '### Status\nstuck on overflow' }, { type: 'done' }],
]);
const result = await executeMovement(
makeMovement(['Read']),
'x'.repeat(10_000),
client as never,
makeContext(),
{ contextManager: cm },
);
// A terminal fallback would report a false completion, so overflow aborts.
expect(result.next).toBe('ABORT');
expect(result.abortCode).toBe('context_overflow');
expect(result.output).toContain('LLM request blocked before send');
// Only the isolated summary call should have happened — no main LLM call
// NOTE(review): the assertion below checks only that no tool was executed;
// the number of client.chat calls is not asserted here.
expect(executeToolMock).not.toHaveBeenCalled();
});
// Same oversized-prompt setup, but the movement is built inline without a
// defaultNext. The result is expected to be ABORT and no tool may be executed.
it('falls back to ABORT when oversized prompt has no defaultNext to force-transition to', async () => {
const { ContextManager } = await import('./context-manager.js');
const cm = new ContextManager({ limitTokens: 1_000 });
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
const client = new FakeClient([
[{ type: 'text', text: 'should not be called' }, { type: 'done' }],
]);
// Movement without defaultNext
const movement: Movement = {
name: 'execute',
edit: true,
persona: 'worker',
instruction: 'Do the work.',
allowedTools: ['Read'],
rules: [{ condition: 'done', next: 'COMPLETE' }],
};
const result = await executeMovement(
movement,
'x'.repeat(10_000),
client as never,
makeContext(),
{ contextManager: cm },
);
expect(result.next).toBe('ABORT');
expect(result.output).toContain('LLM request blocked before send');
expect(executeToolMock).not.toHaveBeenCalled();
});
// Passes an AbortSignal that is aborted before executeMovement is called. The
// result is expected to be ABORT with 'cancelled' in the output, and the
// scripted Read call must never reach executeTool.
it('returns ABORT immediately when cancelSignal is already aborted', async () => {
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
{ type: 'done' },
],
]);
const controller = new AbortController();
controller.abort();
const result = await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
{ cancelSignal: controller.signal },
);
expect(result.next).toBe('ABORT');
expect(result.output).toContain('cancelled');
expect(executeToolMock).not.toHaveBeenCalled();
});
// Every scripted response contains only text (no tool call, no transition).
// The movement is expected to give up with ABORT without executing any tool.
// NOTE(review): the test asserts only the final result; it does not assert how
// many of the scripted responses were consumed.
it('aborts after text-only responses without transition calls', async () => {
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
// LLM returns text-only responses (no tool calls, no transition) 4 times
// MAX_TEXT_ONLY_RETRIES is 3, so after 3 text-only responses it should ABORT
const client = new FakeClient([
[{ type: 'text', text: 'I will do the work' }, { type: 'done' }],
[{ type: 'text', text: 'Still working on it' }, { type: 'done' }],
[{ type: 'text', text: 'Almost done' }, { type: 'done' }],
// Should not reach this
[{ type: 'text', text: 'Never reached' }, { type: 'done' }],
]);
const result = await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
);
expect(result.next).toBe('ABORT');
expect(executeToolMock).not.toHaveBeenCalled();
});
// Three text-only responses are interleaved with tool-call responses, so no
// two text-only responses are consecutive. The movement must not end in ABORT.
it('resets text-only counter when tool calls happen in between', async () => {
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
// Alternate: text-only → tool → text-only → tool → text-only → tool → complete
// Without the fix, the 3 text-only responses would trigger abort.
// With the fix, each tool call resets the counter so it never reaches 3 consecutive.
const client = new FakeClient([
[{ type: 'text', text: 'Thinking about the answer...' }, { type: 'done' }],
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
{ type: 'done' },
],
[{ type: 'text', text: 'Found some info, analyzing...' }, { type: 'done' }],
[
{ type: 'tool_use', id: 'read-2', name: 'Read', input: { file_path: 'b.txt' } },
{ type: 'done' },
],
[{ type: 'text', text: 'Almost done with analysis' }, { type: 'done' }],
[
{ type: 'tool_use', id: 'complete-1', name: 'complete', input: { status: 'success', result: 'Done' } },
{ type: 'done' },
],
]);
const result = await executeMovement(
makeMovement(['Read']),
'task',
client as never,
makeContext(),
);
expect(result.next).not.toBe('ABORT');
expect(executeToolMock).toHaveBeenCalled();
});
// Scripts four responses that each call Read and never transition, then runs
// the movement with maxIterations set to 2. The result is expected to be ABORT
// with the word 'iteration' in the output.
it('aborts when maxIterations is exceeded', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
  // LLM keeps using tools but never calls transition. Tool-call ids are derived
  // from the response index so the fixture is identical on every run and the
  // ids are guaranteed to be unique.
  const toolResponses = Array.from({ length: 4 }, (_, index) => [
    { type: 'tool_use' as const, id: `read-${index}`, name: 'Read', input: { file_path: 'a.txt' } },
    { type: 'done' as const },
  ]);
  const client = new FakeClient(toolResponses);
  const result = await executeMovement(
    makeMovement(['Read']),
    'task',
    client as never,
    makeContext(),
    { maxIterations: 2 },
  );
  expect(result.next).toBe('ABORT');
  expect(result.output).toContain('iteration');
});
});
describe('executeMovement cross-movement Read cache', () => {
// Reset both tool mocks after every test so each case configures its own
// return values and starts with an empty call history.
afterEach(() => {
executeToolMock.mockReset();
getToolDefsMock.mockReset();
});
// Runs two movements that share one ToolResultCache. Movement A ('investigate')
// reads foo.ts through executeTool; movement B ('plan') requests the same file
// and is expected to be served from the cache without another executeTool call.
it('returns a cached Read result when a later movement reads the same file', async () => {
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
const fileContent = 'const answer = 42;\n';
executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
if (name === 'Read' && input['file_path'] === 'foo.ts') {
return { output: fileContent, isError: false };
}
return { output: 'unexpected', isError: true };
});
const cache = new ToolResultCache();
const movementA: Movement = {
name: 'investigate',
edit: false,
persona: 'investigator',
instruction: 'Look at foo.ts.',
allowedTools: ['Read'],
rules: [{ condition: 'done', next: 'plan' }],
defaultNext: 'plan',
};
const clientA = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'foo.ts' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'transition', input: { next_step: 'plan', summary: 'foo.ts read' } },
{ type: 'done' },
],
]);
const resultA = await executeMovement(movementA, 'task', clientA as never, makeContext(), {
toolResultCache: cache,
});
// After movement A: one real tool execution and one cache entry.
expect(resultA.next).toBe('plan');
expect(executeToolMock).toHaveBeenCalledTimes(1);
expect(cache.size()).toBe(1);
const observedToolResults: string[] = [];
const movementB: Movement = {
name: 'plan',
edit: false,
persona: 'planner',
instruction: 'Plan based on foo.ts.',
allowedTools: ['Read'],
rules: [{ condition: 'done', next: 'COMPLETE' }],
defaultNext: 'COMPLETE',
};
const clientB = new FakeClient([
[
{ type: 'tool_use', id: 'read-2', name: 'Read', input: { file_path: 'foo.ts' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-2', name: 'complete', input: { status: 'success', result: 'used cached read' } },
{ type: 'done' },
],
]);
const resultB = await executeMovement(movementB, 'task', clientB as never, makeContext(), {
toolResultCache: cache,
callbacks: {
onToolResult: (_name, info) => observedToolResults.push(info.result),
},
});
expect(resultB.next).toBe('COMPLETE');
// executeTool was NOT invoked again — the cache short-circuited the second Read.
expect(executeToolMock).toHaveBeenCalledTimes(1);
// The LLM saw a cached-tagged result, not the raw content.
expect(observedToolResults.some((r) => r.startsWith('[cached: Read foo.ts from movement investigate at '))).toBe(true);
expect(observedToolResults.some((r) => r.endsWith(fileContent))).toBe(true);
});
// The Read mock returns isError: true; after the movement finishes, the shared
// cache is expected to be empty.
it('does not cache error results', async () => {
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
executeToolMock.mockResolvedValue({ output: 'ENOENT: no such file', isError: true });
const cache = new ToolResultCache();
const movement: Movement = {
name: 'investigate',
edit: false,
persona: 'investigator',
instruction: 'Look at missing.ts.',
allowedTools: ['Read'],
rules: [{ condition: 'done', next: 'COMPLETE' }],
defaultNext: 'COMPLETE',
};
const client = new FakeClient([
[
{ type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'missing.ts' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'tried' } },
{ type: 'done' },
],
]);
await executeMovement(movement, 'task', client as never, makeContext(), { toolResultCache: cache });
expect(cache.size()).toBe(0);
});
// A successful Bash call is executed with a cache attached; the cache is
// expected to stay empty afterwards.
it('skips caching tools outside the cacheable allowlist (Bash etc.)', async () => {
getToolDefsMock.mockResolvedValue(makeToolDefs(['Bash']));
executeToolMock.mockResolvedValue({ output: 'shell ok', isError: false });
const cache = new ToolResultCache();
const movement: Movement = {
name: 'investigate',
edit: false,
persona: 'investigator',
instruction: 'Run.',
allowedTools: ['Bash'],
rules: [{ condition: 'done', next: 'COMPLETE' }],
defaultNext: 'COMPLETE',
};
const client = new FakeClient([
[
{ type: 'tool_use', id: 'b-1', name: 'Bash', input: { command: 'echo hi' } },
{ type: 'done' },
],
[
{ type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'ran' } },
{ type: 'done' },
],
]);
await executeMovement(movement, 'task', client as never, makeContext(), { toolResultCache: cache });
expect(cache.size()).toBe(0);
});
});
describe('executeMovement Phase 2 cache invalidation', () => {
// Reset both tool mocks after every test so each case configures its own
// return values and starts with an empty call history.
afterEach(() => {
executeToolMock.mockReset();
getToolDefsMock.mockReset();
});
// Runs three movements sharing one cache: A reads foo.ts, B edits foo.ts, C
// reads foo.ts again. The mock bumps `version` on every Edit and logs the
// version seen by every real Read, so the test can tell a real re-read from a
// cache hit.
it('invalidates cached Read after a successful Edit on the same path', async () => {
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
const reads: number[] = [];
let version = 1;
executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
if (name === 'Read' && input['file_path'] === 'foo.ts') {
reads.push(version);
return { output: `version=${version}`, isError: false };
}
if (name === 'Edit' && input['file_path'] === 'foo.ts') {
version++;
return { output: 'edited', isError: false };
}
return { output: 'unexpected', isError: true };
});
const cache = new ToolResultCache();
const movementA: Movement = {
name: 'investigate',
edit: true,
persona: 'p',
instruction: 'i',
allowedTools: ['Read'],
rules: [{ condition: 'done', next: 'execute' }],
defaultNext: 'execute',
};
const clientA = new FakeClient([
[{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
[{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 'read' } }, { type: 'done' }],
]);
await executeMovement(movementA, 'task', clientA as never, makeContext(), { toolResultCache: cache });
// Movement A populated the cache with the foo.ts read.
expect(cache.size()).toBe(1);
const movementB: Movement = {
name: 'execute',
edit: true,
persona: 'p',
instruction: 'i',
allowedTools: ['Edit'],
rules: [{ condition: 'done', next: 'verify' }],
defaultNext: 'verify',
};
const clientB = new FakeClient([
[{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
[{ type: 'tool_use', id: 't2', name: 'transition', input: { next_step: 'verify', summary: 'edited' } }, { type: 'done' }],
]);
await executeMovement(movementB, 'task', clientB as never, makeContext(), { toolResultCache: cache });
expect(cache.size()).toBe(0); // Edit evicted the Read entry
const observed: string[] = [];
const movementC: Movement = {
name: 'verify',
edit: false,
persona: 'p',
instruction: 'i',
allowedTools: ['Read'],
rules: [{ condition: 'done', next: 'COMPLETE' }],
defaultNext: 'COMPLETE',
};
const clientC = new FakeClient([
[{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
[{ type: 'tool_use', id: 't3', name: 'complete', input: { status: 'success', result: 're-read' } }, { type: 'done' }],
]);
await executeMovement(movementC, 'task', clientC as never, makeContext(), {
toolResultCache: cache,
callbacks: { onToolResult: (_n, info) => observed.push(info.result) },
});
// Re-read happened (no cache hit) and saw the post-edit version.
expect(reads).toEqual([1, 2]);
expect(observed.some((r) => r === 'version=2')).toBe(true);
expect(observed.some((r) => r.startsWith('[cached:'))).toBe(false);
});
it('invalidates only the affected path; other cached files survive', async () => {
  // Log every call that reaches the mocked executor so the per-file Read
  // counts can be compared at the end of the test.
  const executed: Array<{ name: string; path?: string }> = [];
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
  executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
    executed.push({ name, path: input['file_path'] as string | undefined });
    switch (name) {
      case 'Read':
        return { output: `${input['file_path']} body`, isError: false };
      case 'Edit':
        return { output: 'edited', isError: false };
      default:
        return { output: 'x', isError: true };
    }
  });

  // A single cache instance is handed to both movements.
  const sharedCache = new ToolResultCache();

  // Movement 1: read foo.ts and bar.ts, then transition to "execute".
  const investigateStep: Movement = {
    name: 'investigate',
    edit: true,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'execute' }],
    defaultNext: 'execute',
  };
  const investigateClient = new FakeClient([
    [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'bar.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(investigateStep, 'task', investigateClient as never, makeContext(), {
    toolResultCache: sharedCache,
  });
  expect(sharedCache.size()).toBe(2);

  // Movement 2: edit foo.ts, then request both files again in one turn.
  const executeStep: Movement = {
    name: 'execute',
    edit: true,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Edit', 'Read'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  const toolResults: string[] = [];
  const executeClient = new FakeClient([
    [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
    [
      { type: 'tool_use', id: 'r3', name: 'Read', input: { file_path: 'foo.ts' } },
      { type: 'tool_use', id: 'r4', name: 'Read', input: { file_path: 'bar.ts' } },
      { type: 'done' },
    ],
    [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(executeStep, 'task', executeClient as never, makeContext(), {
    toolResultCache: sharedCache,
    callbacks: { onToolResult: (_n, info) => toolResults.push(info.result) },
  });

  const readsOf = (path: string): number =>
    executed.filter((call) => call.name === 'Read' && call.path === path).length;
  const fooReads = readsOf('foo.ts');
  const barReads = readsOf('bar.ts');
  expect(fooReads).toBe(2); // foo.ts reached the executor again after the Edit
  expect(barReads).toBe(1); // bar.ts was answered without a second execution
  expect(toolResults.some((r) => r.startsWith('[cached: Read bar.ts'))).toBe(true);
  expect(toolResults.some((r) => r.startsWith('[cached: Read foo.ts'))).toBe(false);
});
it('Bash invalidates every file-derived entry conservatively', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Bash']));
  // Each Read returns a distinct body; Bash always succeeds.
  let readCounter = 0;
  executeToolMock.mockImplementation(async (name: string) => {
    switch (name) {
      case 'Read':
        readCounter++;
        return { output: `read ${readCounter}`, isError: false };
      case 'Bash':
        return { output: 'shell ok', isError: false };
      default:
        return { output: 'x', isError: true };
    }
  });

  const sharedCache = new ToolResultCache();

  // Movement 1: populate the cache with two Read results.
  const investigateStep: Movement = {
    name: 'investigate',
    edit: false,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'execute' }],
    defaultNext: 'execute',
  };
  const investigateClient = new FakeClient([
    [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'bar.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(investigateStep, 'task', investigateClient as never, makeContext(), {
    toolResultCache: sharedCache,
  });
  expect(sharedCache.size()).toBe(2);

  // Movement 2: a single successful Bash call that names no file at all.
  const shellStep: Movement = {
    name: 'execute',
    edit: false,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Bash'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  const shellClient = new FakeClient([
    [{ type: 'tool_use', id: 'b1', name: 'Bash', input: { command: 'echo hi' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(shellStep, 'task', shellClient as never, makeContext(), {
    toolResultCache: sharedCache,
  });
  expect(sharedCache.size()).toBe(0); // both Read entries are gone
});
it('does NOT invalidate when Edit/Write/Bash returned an error', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
  // Read succeeds; every Edit attempt is counted and reported as an error.
  let failedEdits = 0;
  executeToolMock.mockImplementation(async (name: string) => {
    switch (name) {
      case 'Read':
        return { output: 'body', isError: false };
      case 'Edit':
        failedEdits++;
        return { output: 'permission denied', isError: true };
      default:
        return { output: 'x', isError: true };
    }
  });

  const sharedCache = new ToolResultCache();

  // Movement 1: cache a single Read of foo.ts.
  const investigateStep: Movement = {
    name: 'investigate',
    edit: true,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'execute' }],
    defaultNext: 'execute',
  };
  const investigateClient = new FakeClient([
    [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(investigateStep, 'task', investigateClient as never, makeContext(), {
    toolResultCache: sharedCache,
  });
  expect(sharedCache.size()).toBe(1);

  // Movement 2: attempt an Edit on the same file; the mock rejects it.
  const editStep: Movement = {
    name: 'execute',
    edit: true,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Edit'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  const editClient = new FakeClient([
    [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
    [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
  ]);
  await executeMovement(editStep, 'task', editClient as never, makeContext(), {
    toolResultCache: sharedCache,
  });
  expect(failedEdits).toBe(1);
  expect(sharedCache.size()).toBe(1); // the errored Edit left the cached Read in place
});
});
describe('executeMovement Phase 3 WorkspaceMemory', () => {
  // Clear mock implementations between tests so they never leak across cases.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('persists memory_update from one movement and exposes it in the next system prompt', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });
    const sharedMemory = new WorkspaceMemory();

    // Movement 1: read a file, then transition with a memory_update payload.
    const investigateStep: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'investigator',
      instruction: 'Find the bug.',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const investigateClient = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'plan',
            summary: 'investigated',
            memory_update: {
              facts: [{ claim: 'foo.ts uses bar()', evidence_paths: ['foo.ts'], confidence: 'high' }],
              decisions: [{ text: 'patch foo.ts directly', evidence_paths: ['foo.ts'] }],
              do_not_repeat: ['re-read foo.ts unless evidence breaks'],
            },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(investigateStep, 'task', investigateClient as never, makeContext(), {
      workspaceMemory: sharedMemory,
    });
    expect(sharedMemory.size().facts).toBe(1);
    expect(sharedMemory.size().decisions).toBe(1);
    expect(sharedMemory.size().doNotRepeat).toBe(1);

    // Movement 2: completes immediately; only its first request is inspected.
    const planStep: Movement = {
      name: 'plan',
      edit: false,
      persona: 'planner',
      instruction: 'Plan the fix.',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const planClient = new FakeClient([
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 'planned' } }, { type: 'done' }],
    ]);
    await executeMovement(planStep, 'task', planClient as never, makeContext(), {
      workspaceMemory: sharedMemory,
    });

    const sentMessages = planClient.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(sentMessages[0]!.role).toBe('system');
    // The section header, all three recorded items and the caveat line must appear.
    const expectedFragments = [
      '## これまでに蓄積した観測',
      'foo.ts uses bar()',
      'patch foo.ts directly',
      're-read foo.ts unless evidence breaks',
      'memory は再調査禁止の根拠ではなく',
    ];
    for (const fragment of expectedFragments) {
      expect(sentMessages[0]!.content).toContain(fragment);
    }
  });

  it('omits memory section when memory is empty', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const emptyMemory = new WorkspaceMemory();
    const step: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const llm = new FakeClient([
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(step, 'task', llm as never, makeContext(), { workspaceMemory: emptyMemory });

    const sentMessages = llm.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(sentMessages[0]!.content).not.toContain('## これまでに蓄積した観測');
  });

  it('Edit invalidates a fact whose evidence_paths includes the edited file', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'Read':
          return { output: 'body', isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });
    const sharedMemory = new WorkspaceMemory();

    // Movement 1: record one fact backed by foo.ts and one backed by bar.ts.
    const investigateStep: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const investigateClient = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'execute',
            summary: 'investigated',
            memory_update: {
              facts: [
                { claim: 'foo.ts uses bar()', evidence_paths: ['foo.ts'] },
                { claim: 'unrelated truth', evidence_paths: ['bar.ts'] },
              ],
            },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(investigateStep, 'task', investigateClient as never, makeContext(), {
      workspaceMemory: sharedMemory,
    });
    expect(sharedMemory.snapshot().facts).toHaveLength(2);

    // Movement 2: a successful Edit of foo.ts.
    const editStep: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const editClient = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 'edited' } }, { type: 'done' }],
    ]);
    await executeMovement(editStep, 'task', editClient as never, makeContext(), {
      workspaceMemory: sharedMemory,
    });

    // Only the bar.ts-backed fact is expected to remain.
    const remaining = sharedMemory.snapshot();
    expect(remaining.facts).toHaveLength(1);
    expect(remaining.facts[0]!.claim).toBe('unrelated truth');
  });

  it('still works when piece submits no memory_update (backward compat)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });
    const untouchedMemory = new WorkspaceMemory();
    const step: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const llm = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 'just summary' } }, { type: 'done' }],
    ]);
    const outcome = await executeMovement(step, 'task', llm as never, makeContext(), {
      workspaceMemory: untouchedMemory,
    });
    expect(outcome.next).toBe('COMPLETE');
    expect(untouchedMemory.size().facts).toBe(0);
  });
});
describe('executeMovement Phase 4 cache extension', () => {
  // Clear mock implementations between tests so they never leak across cases.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('caches Grep results and serves them to a later movement', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Grep']));
    // Count how many Grep calls actually reach the executor.
    let grepExecutions = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name !== 'Grep') return { output: 'x', isError: true };
      grepExecutions++;
      return { output: 'matches in src/foo.ts:1\n', isError: false };
    });
    const sharedCache = new ToolResultCache();

    // Movement 1: the first Grep is executed for real.
    const investigateStep: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Grep'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const investigateClient = new FakeClient([
      [{ type: 'tool_use', id: 'g1', name: 'Grep', input: { pattern: 'foo', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(investigateStep, 'task', investigateClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(grepExecutions).toBe(1);

    // Movement 2: the identical Grep input must not reach the executor again.
    const toolResults: string[] = [];
    const planStep: Movement = {
      name: 'plan',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Grep'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const planClient = new FakeClient([
      [{ type: 'tool_use', id: 'g2', name: 'Grep', input: { pattern: 'foo', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(planStep, 'task', planClient as never, makeContext(), {
      toolResultCache: sharedCache,
      callbacks: { onToolResult: (_n, info) => toolResults.push(info.result) },
    });
    expect(grepExecutions).toBe(1);
    expect(toolResults.some((r) => r.startsWith('[cached: Grep foo'))).toBe(true);
  });

  it('Edit invalidates ALL search entries (Grep) regardless of scope', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Grep', 'Edit']));
    let grepExecutions = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'Grep':
          grepExecutions++;
          return { output: `match v${grepExecutions}`, isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });
    const sharedCache = new ToolResultCache();

    // Movement 1: run a Grep scoped to src/.
    const investigateStep: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Grep'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const investigateClient = new FakeClient([
      [{ type: 'tool_use', id: 'g1', name: 'Grep', input: { pattern: 'x', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(investigateStep, 'task', investigateClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    // Movement 2: edit a file outside the Grep path, then repeat the same Grep.
    const executeStep: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit', 'Grep'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const executeClient = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'unrelated.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'g2', name: 'Grep', input: { pattern: 'x', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(executeStep, 'task', executeClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(grepExecutions).toBe(2);
  });

  it('caches WebFetch and survives Edit invalidation', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['WebFetch', 'Edit']));
    let fetchExecutions = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'WebFetch':
          fetchExecutions++;
          return { output: 'page body', isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });
    const sharedCache = new ToolResultCache();

    // Movement 1: fetch the URL once.
    const investigateStep: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['WebFetch'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const investigateClient = new FakeClient([
      [{ type: 'tool_use', id: 'w1', name: 'WebFetch', input: { url: 'https://example.com/x' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(investigateStep, 'task', investigateClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    // Movement 2: edit a file, then request the same URL again.
    const executeStep: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit', 'WebFetch'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const executeClient = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'w2', name: 'WebFetch', input: { url: 'https://example.com/x' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(executeStep, 'task', executeClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(fetchExecutions).toBe(1);
  });

  it('caches Office tool results (ReadPdf) keyed by path + range', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['ReadPdf']));
    let pdfExecutions = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name !== 'ReadPdf') return { output: 'x', isError: true };
      pdfExecutions++;
      return { output: 'pdf body', isError: false };
    });
    const sharedCache = new ToolResultCache();
    // The same movement definition is run twice against the shared cache.
    const pdfStep: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['ReadPdf'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };

    // Run 1: pages 1-3 are read for real.
    const firstClient = new FakeClient([
      [{ type: 'tool_use', id: 'p1', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '1-3' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(pdfStep, 'task', firstClient as never, makeContext(), { toolResultCache: sharedCache });
    expect(pdfExecutions).toBe(1);

    // Run 2: pages 1-3 again plus the new range 4-6; only one more execution is expected.
    const secondClient = new FakeClient([
      [{ type: 'tool_use', id: 'p2', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '1-3' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'p3', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '4-6' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(pdfStep, 'task', secondClient as never, makeContext(), { toolResultCache: sharedCache });
    expect(pdfExecutions).toBe(2);
  });
});
describe('buildSystemPrompt checklist guidance', () => {
  // Clear mock implementations between tests so they never leak across cases.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('emits a "進捗管理" section with strengthened CreateChecklist guidance', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const step: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const llm = new FakeClient([
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(step, 'task', llm as never, makeContext());

    const sentMessages = llm.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(sentMessages[0]!.role).toBe('system');

    // Section header and the basic CreateChecklist guidance.
    const baseFragments = ['## 進捗管理', 'CreateChecklist', '全 piece で常時利用可能'];
    // Stronger language (option A): the explicit "three or more tool calls"
    // criterion, "when in doubt, create one", and an explicit note about the
    // second and later exchanges.
    const strongerFragments = ['3 個以上のツール呼び出し', '判断に迷ったら作る', '2 回目以降のやり取り'];
    for (const fragment of [...baseFragments, ...strongerFragments]) {
      expect(sentMessages[0]!.content).toContain(fragment);
    }
  });
});
describe('checklist watchdog (option B)', () => {
  // Clear mock implementations between tests so they never leak across cases.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  /** Counts user-role string messages that carry the watchdog tag. */
  const countWatchdogMessages = (messages: Array<{ role: string; content?: unknown }>): number =>
    messages.filter(
      (m) => m.role === 'user' && typeof m.content === 'string' && m.content.includes('[checklist watchdog]'),
    ).length;

  it('injects a one-shot reminder after 5 iterations without any checklist tool', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
    // 6 iterations of plain Read → 1 iteration with complete. The watchdog
    // should fire on iteration 5 (0-indexed).
    const scriptedTurns: Array<Array<{ type: string; id?: string; name?: string; input?: unknown; text?: string }>> = [
      ...Array.from({ length: 6 }, (_, i) => [
        { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
        { type: 'done' },
      ]),
      [
        { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ];
    const llm = new FakeClient(scriptedTurns as never);
    await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());

    // The final request carries the whole conversation; look for the tag there.
    const finalMessages = llm.calls[llm.calls.length - 1]!.messages as Array<{ role: string; content?: unknown }>;
    expect(countWatchdogMessages(finalMessages)).toBe(1);
  });

  it('does NOT fire when CreateChecklist is called early', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'CreateChecklist']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
    // CreateChecklist on iteration 0, then several Read, then complete.
    const scriptedTurns: Array<Array<{ type: string; id?: string; name?: string; input?: unknown }>> = [
      [{ type: 'tool_use', id: 'cc', name: 'CreateChecklist', input: { name: 'plan', items: [] } }, { type: 'done' }],
      ...Array.from({ length: 7 }, (_, i) => [
        { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
        { type: 'done' },
      ]),
      [
        { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ];
    const llm = new FakeClient(scriptedTurns as never);
    await executeMovement(makeMovement(['Read', 'CreateChecklist']), 'task', llm as never, makeContext());

    const finalMessages = llm.calls[llm.calls.length - 1]!.messages as Array<{ role: string; content?: unknown }>;
    expect(countWatchdogMessages(finalMessages)).toBe(0);
  });
});
// ============================================================
// Phase 6a — `complete` tool (§7.1-§7.7 of design plan)
// ============================================================
describe('Phase 6a: complete tool — §7.1 native basics', () => {
  // Clear mock implementations between tests so they never leak across cases.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('success status with result becomes the movement output (no chatter contamination)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      [
        // Free text emitted before the tool call; it must not end up in the output.
        { type: 'text', text: "I'll figure this out first" },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'final answer' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());
    expect(outcome.next).toBe('COMPLETE');
    expect(outcome.output).toBe('final answer');
    expect(outcome.output).not.toContain("I'll figure this out first");
  });

  it('aborted status routes via ABORT next', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'aborted', abort_reason: 'tool unavailable' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());
    // abort_reason is what the movement reports as its output.
    expect(outcome.next).toBe('ABORT');
    expect(outcome.output).toBe('tool unavailable');
  });

  it('needs_user_input routes via ASK next', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: { status: 'needs_user_input', missing_info: 'which dataset?', why_no_default: 'multiple options' },
        },
        { type: 'done' },
      ],
    ]);
    const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());
    // missing_info is what the movement reports as its output.
    expect(outcome.next).toBe('ASK');
    expect(outcome.output).toBe('which dataset?');
  });

  it('rejects success with empty result and forces retry (no accumulatedText fallback for native)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      // Turn 1: success with an empty result, preceded by free text.
      [
        { type: 'text', text: 'this would have been the fallback' },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // Turn 2: a well-formed completion.
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'real answer' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());
    expect(outcome.next).toBe('COMPLETE');
    expect(outcome.output).toBe('real answer');
    // The fallback chatter must NOT have leaked into the output via retry.
    expect(outcome.output).not.toContain('this would have been the fallback');
  });

  it('rejects aborted without abort_reason and forces retry', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      // Turn 1: aborted without a reason.
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'aborted' } }, { type: 'done' }],
      // Turn 2: aborted with a reason.
      [{ type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'aborted', abort_reason: 'real reason' } }, { type: 'done' }],
    ]);
    const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());
    expect(outcome.next).toBe('ABORT');
    expect(outcome.output).toBe('real reason');
  });
});
describe('Phase 6a: complete tool — §7.2 precedence rules (§2.5)', () => {
  // Clear mock implementations between tests so they never leak across cases.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('invalid native complete forces retry (no fallback)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      // First iteration: invalid (empty result) — must retry
      [
        { type: 'text', text: 'this would have been the fallback' },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // Second iteration: LLM corrects with valid native
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'native corrected' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());
    expect(outcome.output).toBe('native corrected');
    // accumulatedText must NOT have leaked into the output via retry
    expect(outcome.output).not.toContain('fallback');
  });

  it('two native completes with conflicting args → retry', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      // Turn 1: two complete calls that disagree on the result.
      [
        { type: 'tool_use', id: 'a', name: 'complete', input: { status: 'success', result: 'A' } },
        { type: 'tool_use', id: 'b', name: 'complete', input: { status: 'success', result: 'B' } },
        { type: 'done' },
      ],
      // Turn 2: a single complete call.
      [
        { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'consistent' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());
    expect(outcome.output).toBe('consistent');
  });

  it('two native completes with identical args → first one used', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      // A single turn with two complete calls carrying the same arguments.
      [
        { type: 'tool_use', id: 'a', name: 'complete', input: { status: 'success', result: 'same answer' } },
        { type: 'tool_use', id: 'b', name: 'complete', input: { status: 'success', result: 'same answer' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());
    expect(outcome.output).toBe('same answer');
  });
});
describe('Phase 6a: complete tool — §7.4 regressions', () => {
  // Clear mock implementations between tests so they never leak across cases.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('non-terminal transition (movement-to-movement) still works', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const investigateStep: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const llm = new FakeClient([
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 'investigated' } }, { type: 'done' }],
    ]);
    const outcome = await executeMovement(investigateStep, 'task', llm as never, makeContext());
    // The transition summary is reported as the movement output.
    expect(outcome.next).toBe('plan');
    expect(outcome.output).toBe('investigated');
  });

  it('transition({next_step:"COMPLETE"}) is rejected (Phase 6b)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    // FakeClient bypasses tool-schema validation, but the engine's
    // validateTransition still rejects unknown next values, so an LLM
    // emitting a legacy terminal transition gets a tool-result error and
    // must retry. This locks in the no-shim behavior.
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: { next_step: 'COMPLETE', summary: 'should not be accepted' },
        },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'corrected' } }, { type: 'done' }],
    ]);
    const investigateStep: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      // No COMPLETE in rules — legacy terminal cannot succeed.
      rules: [{ condition: 'fix needed', next: 'fix' }],
    };
    const outcome = await executeMovement(investigateStep, 'task', llm as never, makeContext());
    expect(outcome.output).toBe('corrected');
  });
});
describe('Phase 6a: complete tool — §7.5 memory_update behavior', () => {
  // Clear mock implementations between tests so they never leak across cases.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('memory_update inside complete is applied to WorkspaceMemory (success)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const store = new WorkspaceMemory();
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'success',
            result: 'done',
            memory_update: { facts: [{ claim: 'X is Y', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), { workspaceMemory: store });
    expect(store.size().facts).toBe(1);
  });

  it('memory_update is NOT applied when complete is invalid and retried', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const store = new WorkspaceMemory();
    const llm = new FakeClient([
      // Invalid first attempt — memory_update must NOT commit
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'success',
            result: '',
            memory_update: { facts: [{ claim: 'should not commit', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
      // Valid retry — different fact, must commit only this one
      [
        {
          type: 'tool_use',
          id: 'c2',
          name: 'complete',
          input: {
            status: 'success',
            result: 'fixed',
            memory_update: { facts: [{ claim: 'committed once', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), { workspaceMemory: store });
    expect(store.size().facts).toBe(1);
    expect(store.snapshot().facts[0]!.claim).toBe('committed once');
  });

  it('memory_update applied for status=aborted', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const store = new WorkspaceMemory();
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'aborted',
            abort_reason: 'tool unavailable',
            memory_update: { facts: [{ claim: 'lesson from failure', confidence: 'medium' }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), { workspaceMemory: store });
    expect(store.size().facts).toBe(1);
  });
});
describe('Phase 6a: complete tool — §7.7 v3 critical (Conditional Go) tests', () => {
  // The tool mocks are module-level; clear them so each scenario starts clean.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('all tool_use ids get a tool_result on retry (conversation history integrity)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      // Turn 1 — two `complete` calls in one response; the first has an empty result.
      [
        { type: 'tool_use', id: 'fail-1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'tool_use', id: 'ignored-1', name: 'complete', input: { status: 'success', result: 'X' } },
        { type: 'done' },
      ],
      // Turn 2 — the corrected completion.
      [
        { type: 'tool_use', id: 'good', name: 'complete', input: { status: 'success', result: 'recovered' } },
        { type: 'done' },
      ],
    ]);

    await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());

    // The request for the second call must answer every tool_use from the first turn.
    const retryRequest = llm.calls[1]?.messages as Array<{ role: string; tool_call_id?: string }>;
    const answeredIds = retryRequest
      .filter((msg) => msg.role === 'tool')
      .map((msg) => msg.tool_call_id);
    for (const expectedId of ['fail-1', 'ignored-1']) {
      expect(answeredIds).toContain(expectedId);
    }
  });
});
// ============================================================
// Phase 6c — `memory_update` standalone tool
// ============================================================
describe('Phase 6c: memory_update tool', () => {
  // The tool mocks are module-level; clear them so each scenario starts clean.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('mid-movement memory_update commits before terminal selection (visible to LLM next iteration)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: emit memory_update only
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: { facts: [{ claim: 'mid-movement observation' }] },
        },
        { type: 'done' },
      ],
      // iteration 1: terminal complete
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });

    expect(result.next).toBe('COMPLETE');
    expect(memory.size().facts).toBe(1);
    expect(memory.snapshot().facts[0]!.claim).toBe('mid-movement observation');
  });

  it('returns "no changes committed" tool_result for empty payload', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: memory_update with an empty input object
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {} },
        { type: 'done' },
      ],
      // iteration 1: terminal complete
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });

    // The next iteration's messages include the tool_result for mu-1.
    const secondCallMessages = client.calls[1]?.messages as Array<{
      role: string;
      content?: unknown;
      tool_call_id?: string;
    }>;
    const muResult = secondCallMessages.find((m) => m.role === 'tool' && m.tool_call_id === 'mu-1');
    expect(muResult).toBeDefined();
    expect(String(muResult?.content)).toMatch(/no changes committed/);
    expect(memory.size().facts).toBe(0);
  });

  it('exact-claim duplicate within same iteration is merged (Codex Phase 6c §2.5)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: two memory_update calls with the same claim, different evidence
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: { facts: [{ claim: 'X is Y', evidence_paths: ['a.ts'] }] },
        },
        {
          type: 'tool_use',
          id: 'mu-2',
          name: 'memory_update',
          input: { facts: [{ claim: 'X is Y', evidence_paths: ['b.ts'] }] },
        },
        { type: 'done' },
      ],
      // iteration 1: terminal complete
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });

    const snap = memory.snapshot();
    expect(snap.facts).toHaveLength(1);
    // evidence_paths should be union-merged, not duplicated.
    // Sort a copy: Array.prototype.sort mutates in place, and the assertion
    // should not reorder the array it is inspecting.
    expect([...snap.facts[0]!.evidencePaths].sort()).toEqual(['a.ts', 'b.ts']);
  });

  it('memory_update commits even when complete in the same iteration is invalid (retry path)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: memory_update + invalid complete (empty result) → retry
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: { facts: [{ claim: 'observation persists' }] },
        },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // iteration 1: corrected complete
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });

    expect(result.next).toBe('COMPLETE');
    // The memory_update from the FIRST iteration should still be committed,
    // even though the iteration retried due to invalid complete args.
    expect(memory.snapshot().facts.map((f) => f.claim)).toContain('observation persists');
  });

  it('memory_update is a META_TOOL — works even when allowed_tools is empty', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs([]));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: { facts: [{ claim: 'still works' }] },
        },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);
    const movement: Movement = {
      name: 'execute',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: [], // empty — yet memory_update is in the tool catalog
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };

    await executeMovement(movement, 'task', client as never, makeContext(), { workspaceMemory: memory });

    expect(memory.size().facts).toBe(1);
  });
});
// ============================================================
// Traceability T-1 — events.jsonl emission scenarios
// ============================================================
import { mkdtempSync, rmSync, readFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { createFileEventLogger, parseEventLine, type EventBase } from '../progress/event-log.js';
/**
 * Load every event recorded in `<workspace>/logs/events.jsonl`.
 *
 * Empty lines are skipped. Any remaining line that `parseEventLine` does not
 * report as `ok` raises an error, so a malformed log fails the calling test.
 *
 * @param workspace Directory that contains the `logs/events.jsonl` file.
 * @returns The parsed events, in file order.
 * @throws Error when a non-empty line does not parse to an `ok` result.
 */
function readEvents(workspace: string): EventBase[] {
  const logFile = join(workspace, 'logs', 'events.jsonl');
  const rawLog = readFileSync(logFile, 'utf-8').trim();
  const events: EventBase[] = [];
  for (const line of rawLog.split('\n')) {
    if (line.length === 0) continue;
    const outcome = parseEventLine(line);
    if (outcome.kind !== 'ok') throw new Error(`unexpected non-ok event line: ${line}`);
    events.push(outcome.event);
  }
  return events;
}
describe('Traceability T-1: agent-loop emission', () => {
  // Fresh temp workspace per test; events are written beneath it and read back.
  let workspace: string;

  beforeEach(() => {
    workspace = mkdtempSync(join(tmpdir(), 'trace-emission-'));
  });

  afterEach(() => {
    // Reset the module-level tool mocks, as the other suites in this file do,
    // so a mockImplementation / resolved value set by one test cannot leak
    // into the next one.
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
    rmSync(workspace, { recursive: true, force: true });
  });

  /** Build a ToolContext whose event logger is a file logger for `workspaceLocal`. */
  function makeContextWithEvents(workspaceLocal: string): ToolContext {
    return {
      workspacePath: workspaceLocal,
      editAllowed: true,
      eventLogger: createFileEventLogger({ workspacePath: workspaceLocal, runId: 'test-run' }),
    };
  }

  it('emits movement_start, tool_call/result pair, and movement_complete for a single Read flow', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });
    const client = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);

    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContextWithEvents(workspace));

    const events = readEvents(workspace);
    const kinds = events.map((e) => e.kind);
    expect(kinds).toContain('movement_start');
    expect(kinds).toContain('tool_call');
    expect(kinds).toContain('tool_result');
    expect(kinds).toContain('complete');
    expect(kinds).toContain('movement_complete');

    // The call and its result must be tied together by one correlation id.
    const call = events.find((e) => e.kind === 'tool_call' && e.llmToolCallId === 'r1');
    const result = events.find((e) => e.kind === 'tool_result' && e.llmToolCallId === 'r1');
    expect(call?.correlationId).toBeTruthy();
    expect(result?.correlationId).toBe(call?.correlationId);
    // Guard first so a missing event fails as an assertion, not a TypeError.
    expect(result).toBeDefined();
    expect((result?.payload as { cacheHit: boolean }).cacheHit).toBe(false);
  });

  it('emits cache_set on first Read and cache_hit on second Read of the same file', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });
    const cache = new ToolResultCache();
    const ctx = makeContextWithEvents(workspace);

    // Movement A reads foo.ts and transitions to `plan`.
    const movA: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movA, 'task', clientA as never, ctx, { toolResultCache: cache });

    // Movement B reads the same file while sharing the same cache instance.
    const movB: Movement = {
      name: 'plan',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, ctx, { toolResultCache: cache });

    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'cache_set')).toBe(true);
    expect(events.some((e) => e.kind === 'cache_hit')).toBe(true);
    const cacheHit = events.find((e) => e.kind === 'cache_hit');
    // Guard first so a missing event fails as an assertion, not a TypeError.
    expect(cacheHit).toBeDefined();
    expect((cacheHit?.payload as { sourceMovement: string }).sourceMovement).toBe('investigate');
  });

  it('emits cache_invalidate and memory_invalidate after a successful Edit', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'Read') return { output: 'body', isError: false };
      if (name === 'Edit') return { output: 'edited', isError: false };
      return { output: 'x', isError: true };
    });
    const cache = new ToolResultCache();
    const memory = new WorkspaceMemory();
    const ctx = makeContextWithEvents(workspace);

    // Movement A reads foo.ts and records a fact whose evidence path is foo.ts.
    const movA: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'execute',
            summary: 's',
            memory_update: { facts: [{ claim: 'foo.ts uses bar', evidence_paths: ['foo.ts'] }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(movA, 'task', clientA as never, ctx, { toolResultCache: cache, workspaceMemory: memory });

    // Movement B edits the same file, sharing the cache and memory from movement A.
    const movB: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, ctx, { toolResultCache: cache, workspaceMemory: memory });

    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'cache_invalidate')).toBe(true);
    expect(events.some((e) => e.kind === 'memory_invalidate')).toBe(true);
  });

  it('emits memory_update_call with counts when the LLM calls memory_update', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const ctx = makeContextWithEvents(workspace);
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: { facts: [{ claim: 'x' }, { claim: 'y' }] } },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);

    await executeMovement(makeMovement(['Read']), 'task', client as never, ctx, { workspaceMemory: memory });

    const events = readEvents(workspace);
    const muCall = events.find((e) => e.kind === 'memory_update_call');
    expect(muCall).toBeDefined();
    const payload = muCall?.payload as { counts: { factsAdded: number } };
    expect(payload.counts.factsAdded).toBe(2);
  });

  it('emits watchdog_fire when no checklist tool is used in 5 iterations', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });
    // Six Read-only turns (each on a distinct file), then a terminal complete.
    const responses: Array<Array<{ type: string; id?: string; name?: string; input?: unknown }>> = [];
    for (let i = 0; i < 6; i++) {
      responses.push([
        { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
        { type: 'done' },
      ]);
    }
    responses.push([
      { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ]);
    const client = new FakeClient(responses as never);

    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContextWithEvents(workspace));

    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'watchdog_fire')).toBe(true);
  });

  it('all events share the same runId across one piece run', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const ctx = makeContextWithEvents(workspace);
    const client = new FakeClient([
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);

    await executeMovement(makeMovement(['Read']), 'task', client as never, ctx);

    const events = readEvents(workspace);
    expect(events.length).toBeGreaterThan(0);
    const runIds = new Set(events.map((e) => e.runId));
    expect(runIds.size).toBe(1);
    expect(runIds.has('test-run')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Phase 4 (SSH Console): buildSystemPrompt screen injection
// ---------------------------------------------------------------------------
import { buildSystemPrompt, __setActiveSessionLookup, type HandoffContext } from './agent-loop.js';
describe('buildSystemPrompt console injection', () => {
  // Clear the session lookup hook after each test so it never leaks across tests.
  afterEach(() => {
    __setActiveSessionLookup(null);
  });

  /** Minimal movement whose only varying part is the allowed tool list. */
  function makeConsoleMovement(allowedTools: string[]): Movement {
    return {
      name: 'm1',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools,
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
  }

  /** Fake 80x24 session whose screen snapshot returns `text`. */
  function fakeSessionShowing(text: string) {
    return {
      cols: 80,
      rows: 24,
      snapshotScreen: () => ({ text }),
    };
  }

  /** Render the system prompt with the fixed positional arguments these tests share. */
  function renderPrompt(allowedTools: string[], taskId: string | null) {
    return buildSystemPrompt(
      makeConsoleMovement(allowedTools),
      1,
      5,
      [],
      undefined,
      null,
      undefined,
      undefined,
      undefined,
      taskId,
    );
  }

  it('appends screen block when SshConsole* in allowed_tools and session exists', () => {
    const session = fakeSessionShowing('mock-screen-line-1\nmock-screen-line-2');
    __setActiveSessionLookup((_taskId: string) => session);

    const prompt = renderPrompt(['SshConsoleEnsure', 'SshConsoleSend', 'SshConsoleSnapshot'], 't1');

    for (const expected of ['Console screen', 'mock-screen-line-1', 'mock-screen-line-2']) {
      expect(prompt).toContain(expected);
    }
  });

  it('does NOT inject when piece does not allow console tools', () => {
    const session = fakeSessionShowing('mock-screen-X');
    __setActiveSessionLookup((_taskId: string) => session);

    const prompt = renderPrompt(['Read', 'Bash'], 't1');

    expect(prompt).not.toContain('Console screen');
    expect(prompt).not.toContain('mock-screen-X');
  });

  it('does NOT inject when no active session is registered for the task', () => {
    __setActiveSessionLookup((_taskId: string) => null);

    const prompt = renderPrompt(['SshConsoleSend'], 't1');

    expect(prompt).not.toContain('Console screen');
  });

  it('does NOT inject when taskId is missing (subtask without local_task binding)', () => {
    const session = fakeSessionShowing('mock-screen');
    // Records the id the lookup was asked for; it must stay null in this scenario.
    let lookedUp: string | null = null;
    __setActiveSessionLookup((requestedId: string) => {
      lookedUp = requestedId;
      return session;
    });

    const prompt = renderPrompt(['SshConsoleSnapshot'], null);

    expect(prompt).not.toContain('Console screen');
    expect(lookedUp).toBeNull();
  });

  it('truncates injected screen to the configured tail length', () => {
    // 200 numbered lines: line-1 … line-200.
    const screenLines: string[] = [];
    for (let n = 1; n <= 200; n++) {
      screenLines.push(`line-${n}`);
    }
    const session = fakeSessionShowing(screenLines.join('\n'));
    __setActiveSessionLookup((_taskId: string) => session);

    const prompt = renderPrompt(['SshConsoleSend'], 't1');

    // Default tail = 24 lines: should contain the last line but not very early lines.
    expect(prompt).toContain('line-200');
    expect(prompt).toContain('line-177'); // 200 - 24 + 1 = 177 (inclusive tail)
    expect(prompt).not.toContain('line-1\n');
    expect(prompt).not.toContain('line-50');
  });
});
describe('buildSystemPrompt — handoff blocks', () => {
  // buildSystemPrompt is called directly here, with a minimal Movement that
  // gives the rendering pipeline what it needs.
  const movement: Movement = {
    name: 'investigate',
    persona: 'investigator',
    instruction: 'do the thing',
    rules: [{ condition: 'done', next: 'plan' }],
    allowedTools: [],
    edit: false,
  };

  /** Render the prompt for `movement` with the given handoff context and no task id. */
  const renderWithHandoff = (handoff: HandoffContext) =>
    buildSystemPrompt(movement, 1, 5, [], undefined, null, undefined, undefined, undefined, null, handoff);

  it('always includes the static Continue block', () => {
    const rendered = buildSystemPrompt(movement);
    for (const fragment of ['Continue 機能', 'output/ ファイルは次の piece でもそのまま参照されます']) {
      expect(rendered).toContain(fragment);
    }
  });

  it('omits the dynamic block when handoffContext is not provided', () => {
    const rendered = buildSystemPrompt(movement);
    expect(rendered).not.toContain('前 piece からの引き継ぎ');
  });

  it('includes the dynamic block with prev piece + result when handoffContext is provided', () => {
    const rendered = renderWithHandoff({
      prevPiece: 'manual-writer',
      prevResult: 'output/manual.md is ready, with 5 sections covering setup',
    });
    expect(rendered).toContain('前 piece からの引き継ぎ');
    expect(rendered).toContain('manual-writer');
    expect(rendered).toContain('output/manual.md is ready');
  });

  it('handles null prevResult gracefully', () => {
    const rendered = renderWithHandoff({ prevPiece: 'manual-writer', prevResult: null });
    expect(rendered).toContain('前 piece からの引き継ぎ');
    expect(rendered).toContain('前 piece は最終出力を残しませんでした');
  });

  it('truncates very long prevResult to ~2500 chars (head + tail)', () => {
    // Input layout: 3000 'A's, then a unique middle marker, then 3000 'C's.
    // The expectation is that the head (a run of 'A') and the tail (a run of 'C')
    // survive around a truncation marker, while the middle marker is dropped.
    const middleMarker = 'BBBBBMIDDLE';
    const oversized = ['A'.repeat(3000), middleMarker, 'C'.repeat(3000)].join('');

    const rendered = renderWithHandoff({ prevPiece: 'manual-writer', prevResult: oversized });

    expect(rendered).toContain('[truncated]');
    expect(rendered).toContain('A'.repeat(100)); // head present
    expect(rendered).toContain('C'.repeat(100)); // tail present
    expect(rendered).not.toContain(middleMarker); // middle was cut
  });
});