import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import type { LLMEvent, ToolDef } from '../llm/openai-compat.js';
import type { ToolContext } from './tools/index.js';
import { stripThinkingTokens } from './agent-loop.js';

// NOTE(review): several inputs and titles below look like they lost
// angle-bracket markup (e.g. an opening/closing thinking tag) somewhere in
// transit: two tests share the title 'strips ... blocks' and the inputs contain
// no delimiter for the function to strip. The literals are kept verbatim here;
// confirm them against the patterns stripThinkingTokens actually recognises.
describe('stripThinkingTokens', () => {
  it('strips ... blocks', () => {
    expect(stripThinkingTokens('reasoning hereactual output')).toBe('actual output');
  });

  it('strips <|thinking|>... blocks', () => {
    expect(stripThinkingTokens('<|thinking|>deep thought<|/thinking|>result')).toBe('result');
  });

  it('strips gemma4 thought pattern', () => {
    expect(stripThinkingTokens('thought\n')).toBe('');
    expect(stripThinkingTokens('thought hello')).toBe('hello');
  });

  it('strips ... blocks', () => {
    expect(stripThinkingTokens('internal reasoningvisible output')).toBe('visible output');
  });

  it('returns normal text unchanged', () => {
    expect(stripThinkingTokens('hello world')).toBe('hello world');
  });

  it('handles multiline thinking blocks', () => {
    expect(stripThinkingTokens('\nline1\nline2\noutput')).toBe('output');
  });
});

// The mocks are created through vi.hoisted so that they already exist when the
// vi.mock factory below (which Vitest hoists above the imports) is evaluated.
const { executeToolMock, getToolDefsMock } = vi.hoisted(() => ({
  executeToolMock: vi.fn(),
  getToolDefsMock: vi.fn(),
}));

vi.mock('./tools/index.js', () => ({
  executeTool: executeToolMock,
  getToolDefs: getToolDefsMock,
}));

import { executeMovement, type Movement } from './agent-loop.js';
import { ToolResultCache } from './context/tool-result-cache.js';
import { WorkspaceMemory } from './context/workspace-memory.js';

/**
 * Builds the default `execute` movement used by most tests.
 *
 * @param allowedTools Tool names the movement is allowed to call.
 * @returns An editable movement whose only rule and default both lead to `COMPLETE`.
 */
function makeMovement(allowedTools: string[]): Movement {
  return {
    name: 'execute',
    edit: true,
    persona: 'worker',
    instruction: 'Do the work.',
    allowedTools,
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
}

/**
 * Creates minimal function-style tool definitions, one per name.
 *
 * @param names Tool names; each is reused as the tool's description.
 * @returns Definitions with an empty object parameter schema.
 */
function makeToolDefs(names: string[]): ToolDef[] {
  return names.map((name) => ({
    type: 'function',
    function: {
      name,
      description: name,
      parameters: { type: 'object', properties: {}, required: [] },
    },
  }));
}

/** Returns the tool context shared by the tests: a fixed workspace path with edits allowed. */
function makeContext(): ToolContext {
  return {
    workspacePath: '/tmp/agent-loop-test',
    editAllowed: true,
  };
}

/**
 * Creates a promise together with its externally callable resolver, so a test
 * can hold a mocked tool call open and release it at a chosen moment.
 *
 * @returns The pending promise and the function that resolves it.
 */
function deferred<T>(): { promise: Promise<T>; resolve: (value: T) => void } {
  // Definite assignment: the Promise executor runs synchronously, so `resolve`
  // is always set before this function returns.
  let resolve!: (value: T) => void;
  const promise = new Promise<T>((res) => {
    resolve = res;
  });
  return { promise, resolve };
}

/**
 * Polls `predicate` until it returns true, yielding to the event loop between
 * checks via a zero-delay timer.
 *
 * @param predicate Condition to wait for.
 * @param timeoutMs Maximum time to wait in milliseconds (default 250).
 * @throws Error with message 'Timed out waiting for condition' once more than
 *   `timeoutMs` has elapsed without the predicate becoming true.
 */
async function waitFor(predicate: () => boolean, timeoutMs = 250): Promise<void> {
  const start = Date.now();
  while (!predicate()) {
    if (Date.now() - start > timeoutMs) {
      throw new Error('Timed out waiting for condition');
    }
    await new Promise<void>((resolve) => setTimeout(resolve, 0));
  }
}

/**
 * Scripted stand-in for the LLM client.
 *
 * Every `chat()` call records the messages and tools it received in `calls`
 * and then replays the next scripted response in order. Once the script is
 * exhausted, further calls yield no events.
 */
class FakeClient {
  private index = 0;
  readonly calls: Array<{ messages: unknown; tools?: unknown }> = [];

  constructor(private readonly responses: LLMEvent[][]) {}

  async *chat(messages: unknown, tools?: unknown, _signal?: AbortSignal): AsyncGenerator<LLMEvent> {
    this.calls.push({ messages, tools });
    const response = this.responses[this.index++] ?? [];
    for (const event of response) {
      yield event;
    }
  }
}

describe('executeMovement parallel tool execution', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('runs consecutive safe tool calls in parallel', async () => {
    // Each gate keeps its mocked tool call pending until the test releases it.
    const readGate = deferred<{ output: string; isError: boolean }>();
    const globGate = deferred<{ output: string; isError: boolean }>();
    const started: string[] = [];
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Glob']));
    executeToolMock.mockImplementation(async (name: string) => {
      started.push(name);
      if (name === 'Read') return readGate.promise;
      if (name === 'Glob') return globGate.promise;
      return { output: 'ok', isError: false };
    });
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
        { type: 'tool_use', id: 'glob-1', name: 'Glob', input: { pattern: 'input/*' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);
    const runPromise = executeMovement(makeMovement(['Read',
'Glob']), 'task', client as never, makeContext()); await waitFor(() => started.length === 2); expect(started).toEqual(['Read', 'Glob']); readGate.resolve({ output: 'read ok', isError: false }); globGate.resolve({ output: 'glob ok', isError: false }); const result = await runPromise; expect(result.next).toBe('COMPLETE'); expect(executeToolMock).toHaveBeenCalledTimes(2); }); it('keeps side-effecting tools sequential as a barrier', async () => { const readGate = deferred<{ output: string; isError: boolean }>(); const writeGate = deferred<{ output: string; isError: boolean }>(); const started: string[] = []; getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Write'])); executeToolMock.mockImplementation(async (name: string) => { started.push(name); if (name === 'Read') return readGate.promise; if (name === 'Write') return writeGate.promise; return { output: 'ok', isError: false }; }); const client = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } }, { type: 'tool_use', id: 'write-1', name: 'Write', input: { file_path: 'output/a.txt', content: 'x' } }, { type: 'done' }, ], [ { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }, ], ]); const runPromise = executeMovement(makeMovement(['Read', 'Write']), 'task', client as never, makeContext()); await waitFor(() => started.length === 1); expect(started).toEqual(['Read']); readGate.resolve({ output: 'read ok', isError: false }); await waitFor(() => started.length === 2); expect(started).toEqual(['Read', 'Write']); writeGate.resolve({ output: 'write ok', isError: false }); const result = await runPromise; expect(result.next).toBe('COMPLETE'); expect(executeToolMock).toHaveBeenCalledTimes(2); }); it('executes regular tools before transition even if transition appears mid-batch', async () => { const readGate = deferred<{ output: string; isError: boolean }>(); const globGate = deferred<{ output: 
string; isError: boolean }>(); const started: string[] = []; getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Glob'])); executeToolMock.mockImplementation(async (name: string) => { started.push(name); if (name === 'Read') return readGate.promise; if (name === 'Glob') return globGate.promise; return { output: 'glob ok', isError: false }; }); const client = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } }, { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'tool_use', id: 'glob-1', name: 'Glob', input: { pattern: 'input/*' } }, { type: 'done' }, ], ]); const runPromise = executeMovement(makeMovement(['Read', 'Glob']), 'task', client as never, makeContext()); await waitFor(() => started.length === 2); expect(started).toEqual(['Read', 'Glob']); readGate.resolve({ output: 'read ok', isError: false }); globGate.resolve({ output: 'glob ok', isError: false }); const result = await runPromise; expect(result.next).toBe('COMPLETE'); expect(executeToolMock).toHaveBeenCalledTimes(2); }); it('emits text, preview, and tool callbacks in order', async () => { const textChunks: string[] = []; const previews: Array<{ movement: string; preview: string }> = []; const toolUses: Array<{ name: string; input: Record }> = []; getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'read ok', isError: false }); const client = new FakeClient([ [ { type: 'text', text: 'first ' }, { type: 'text', text: 'second' }, { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } }, { type: 'done' }, ], [ { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }, ], ]); const result = await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), { callbacks: { onText: (text) => textChunks.push(text), onTextPreview: 
(movementName, preview) => previews.push({ movement: movementName, preview }), onToolUse: (toolName, input) => toolUses.push({ name: toolName, input }), }, }, ); expect(result.next).toBe('COMPLETE'); expect(textChunks).toEqual(['first ', 'second']); expect(previews).toEqual([{ movement: 'execute', preview: 'first second' }]); expect(toolUses).toContainEqual({ name: 'Read', input: { file_path: 'input/a.txt' } }); }); it('fires onToolResult callback after tool execution', async () => { const toolResults: Array<{ name: string; isError: boolean }> = []; getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'read ok', isError: false }); const client = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } }, { type: 'done' }, ], [ { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }, ], ]); await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), { callbacks: { onToolResult: (name, info) => toolResults.push({ name, isError: info.isError }), }, }, ); expect(toolResults).toContainEqual({ name: 'Read', isError: false }); }); it('fires onMemoryCheckpoint callback after regular tool use', async () => { const checkpoints: number[] = []; getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'read ok', isError: false }); const client = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } }, { type: 'done' }, ], [ { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }, ], ]); await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), { callbacks: { onMemoryCheckpoint: (count) => checkpoints.push(count), }, }, ); expect(checkpoints).toEqual([1]); }); it('triggers force_transition when context manager 
signals exhaustion', async () => { const { ContextManager } = await import('./context-manager.js'); const cm = new ContextManager({ thresholds: [{ ratio: 0.005, action: 'force_transition' }], limitTokens: 10_000, }); getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'ok', isError: false }); const client = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } }, { type: 'done', usage: { prompt_tokens: 80, completion_tokens: 10 } }, ], ]); const result = await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), { contextManager: cm }, ); expect(result.next).toBe('COMPLETE'); expect(result.output).toContain('Context limit reached'); }); it('fires onContextAction callback when context threshold crossed', async () => { const { ContextManager } = await import('./context-manager.js'); const cm = new ContextManager({ thresholds: [{ ratio: 0.005, action: 'warn' }], limitTokens: 10_000, }); const contextActions: Array<{ type: string }> = []; getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'ok', isError: false }); const client = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } }, { type: 'done', usage: { prompt_tokens: 60, completion_tokens: 10 } }, ], [ { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }, ], ]); await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), { callbacks: { onContextAction: (action) => contextActions.push({ type: action.type }), }, contextManager: cm, }, ); expect(contextActions).toContainEqual({ type: 'warn' }); }); it('fires onContextUpdate callback after each LLM usage update', async () => { const { ContextManager } = await import('./context-manager.js'); const cm = new ContextManager({ limitTokens: 100_000 }); const 
updates: Array<{ promptTokens: number; limitTokens: number }> = []; getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'ok', isError: false }); const client = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } }, { type: 'done', usage: { prompt_tokens: 1234, completion_tokens: 10 } }, ], [ { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }, ], ]); await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), { callbacks: { onContextUpdate: (payload) => updates.push(payload), }, contextManager: cm, }, ); expect(updates.length).toBeGreaterThanOrEqual(1); expect(updates).toContainEqual({ promptTokens: 0, limitTokens: 100_000 }); expect(updates).toContainEqual({ promptTokens: 1234, limitTokens: 100_000 }); }); it('compacts oversized tool results before the next LLM request', async () => { const { ContextManager } = await import('./context-manager.js'); const cm = new ContextManager({ limitTokens: 20_000 }); const hugeBase64 = ``; getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: hugeBase64, isError: false }); const client = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/page.html' } }, { type: 'done', usage: { prompt_tokens: 1000, completion_tokens: 10 } }, ], [ { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done', usage: { prompt_tokens: 2000, completion_tokens: 10 } }, ], ]); const result = await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), { contextManager: cm }, ); expect(result.next).toBe('COMPLETE'); expect(client.calls).toHaveLength(2); const secondMessages = client.calls[1]!.messages as Array<{ role: string; content?: string }>; const toolMessage = 
secondMessages.find((message) => message.role === 'tool'); expect(toolMessage?.content).toContain('[Tool result omitted before LLM request]'); expect(toolMessage?.content).not.toContain(hugeBase64); // The agent should also receive a user-facing instruction to retry with a // narrower scope, instead of silently dropping content. const feedbackMessage = secondMessages.find( (message) => message.role === 'user' && message.content?.includes('LLM コンテキストに入れられませんでした'), ); expect(feedbackMessage).toBeDefined(); expect(feedbackMessage?.content).toMatch(/Read\(offset\/limit\)|Grep/); }); it('dedupes duplicate file reads before compacting tool results', async () => { const { ContextManager } = await import('./context-manager.js'); const cm = new ContextManager({ limitTokens: 40_000 }); // 80% guard => 32_000 tokens. Each large ASCII read is roughly 20k tokens. // Two duplicate reads exceed the guard without dedup; after dedup, only // the latest read survives and the request fits. getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); const largeContent = 'X'.repeat(72_000); executeToolMock.mockResolvedValue({ output: largeContent, isError: false }); const client = new FakeClient([ [ { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: '/dup.ts' } }, { type: 'done', usage: { prompt_tokens: 500, completion_tokens: 10 } }, ], [ { type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: '/dup.ts' } }, { type: 'done', usage: { prompt_tokens: 18_000, completion_tokens: 10 } }, ], [ { type: 'tool_use', id: 't', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }, ], ]); const result = await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), { contextManager: cm }, ); expect(result.next).toBe('COMPLETE'); expect(client.calls.length).toBe(3); // The 3rd LLM call should see the older Read replaced with a placeholder const thirdCallMessages = client.calls[2]!.messages as Array<{ role: string; content?: 
string }>; const placeholderMessages = thirdCallMessages.filter( (m) => m.role === 'tool' && typeof m.content === 'string' && m.content.includes('Duplicate Read of /dup.ts'), ); expect(placeholderMessages.length).toBe(1); // The latest Read should still carry the full content const fullReadMessages = thirdCallMessages.filter( (m) => m.role === 'tool' && typeof m.content === 'string' && m.content === largeContent, ); expect(fullReadMessages.length).toBe(1); }); it('summarizes old turns when dedup+compact alone cannot fit prompt', async () => { const { ContextManager } = await import('./context-manager.js'); const { SUMMARY_MARKER_PREFIX } = await import('./context/history-compactor.js'); // Tight context, distinct Bash outputs (so dedup cannot help) each below // LARGE_TOOL_RESULT_TOKENS (so compaction cannot help either) — only // history summarization can keep the conversation going. // Phase 6c expanded the system prompt by ~200 chars (memory_update // guidance + new tool definition); we bump the test limit a bit so // summarization still fires on the 5th Bash turn rather than the 4th, // matching the FakeClient slot layout below. const cm = new ContextManager({ limitTokens: 35_000 }); getToolDefsMock.mockResolvedValue(makeToolDefs(['Bash'])); executeToolMock.mockResolvedValue({ output: 'B'.repeat(18_000), isError: false }); // FakeClient.chat() walks a single response list, so the isolated // summarization LLM call (runIsolatedLlm) consumes the next slot. Slot 5 // is the summary response; slot 6 is the post-summary transition. 
const client = new FakeClient([ [{ type: 'tool_use', id: 'b1', name: 'Bash', input: { command: 'echo 1' } }, { type: 'done', usage: { prompt_tokens: 1_000, completion_tokens: 5 } }], [{ type: 'tool_use', id: 'b2', name: 'Bash', input: { command: 'echo 2' } }, { type: 'done', usage: { prompt_tokens: 5_500, completion_tokens: 5 } }], [{ type: 'tool_use', id: 'b3', name: 'Bash', input: { command: 'echo 3' } }, { type: 'done', usage: { prompt_tokens: 10_000, completion_tokens: 5 } }], [{ type: 'tool_use', id: 'b4', name: 'Bash', input: { command: 'echo 4' } }, { type: 'done', usage: { prompt_tokens: 15_000, completion_tokens: 5 } }], [{ type: 'tool_use', id: 'b5', name: 'Bash', input: { command: 'echo 5' } }, { type: 'done', usage: { prompt_tokens: 20_000, completion_tokens: 5 } }], [{ type: 'text', text: '## ゴール\nrun bash commands\n## 進捗\nDone: 5 commands' }, { type: 'done' }], [ { type: 'tool_use', id: 't', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }, ], ]); const result = await executeMovement( makeMovement(['Bash']), 'task', client as never, makeContext(), { contextManager: cm }, ); expect(result.next).toBe('COMPLETE'); const finalMessages = client.calls[client.calls.length - 1]!.messages as Array<{ role: string; content?: string }>; const markerMessages = finalMessages.filter( (m) => m.role === 'user' && typeof m.content === 'string' && m.content.startsWith(SUMMARY_MARKER_PREFIX), ); // Phase 6c expanded the system prompt; the exact iteration where // summarization fires shifted slightly. The invariant we care about // is "summarization fired at least once before completion". 
expect(markerMessages.length).toBeGreaterThanOrEqual(1); }); it('aborts when initial prompt is oversized and defaultNext is terminal', async () => { const { ContextManager } = await import('./context-manager.js'); const cm = new ContextManager({ limitTokens: 1_000 }); getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); // The guard will fail before the main LLM is called, but the // last-resort summary path uses runIsolatedLlm (also via FakeClient), // so we provide one response for that summary call. const client = new FakeClient([ [{ type: 'text', text: '### Status\nstuck on overflow' }, { type: 'done' }], ]); const result = await executeMovement( makeMovement(['Read']), 'x'.repeat(10_000), client as never, makeContext(), { contextManager: cm }, ); // A terminal fallback would report a false completion, so overflow aborts. expect(result.next).toBe('ABORT'); expect(result.abortCode).toBe('context_overflow'); expect(result.output).toContain('LLM request blocked before send'); // Only the isolated summary call should have happened — no main LLM call expect(executeToolMock).not.toHaveBeenCalled(); }); it('falls back to ABORT when oversized prompt has no defaultNext to force-transition to', async () => { const { ContextManager } = await import('./context-manager.js'); const cm = new ContextManager({ limitTokens: 1_000 }); getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); const client = new FakeClient([ [{ type: 'text', text: 'should not be called' }, { type: 'done' }], ]); // Movement without defaultNext const movement: Movement = { name: 'execute', edit: true, persona: 'worker', instruction: 'Do the work.', allowedTools: ['Read'], rules: [{ condition: 'done', next: 'COMPLETE' }], }; const result = await executeMovement( movement, 'x'.repeat(10_000), client as never, makeContext(), { contextManager: cm }, ); expect(result.next).toBe('ABORT'); expect(result.output).toContain('LLM request blocked before send'); 
expect(executeToolMock).not.toHaveBeenCalled(); }); it('returns ABORT immediately when cancelSignal is already aborted', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); const client = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } }, { type: 'done' }, ], ]); const controller = new AbortController(); controller.abort(); const result = await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), { cancelSignal: controller.signal }, ); expect(result.next).toBe('ABORT'); expect(result.output).toContain('cancelled'); expect(executeToolMock).not.toHaveBeenCalled(); }); it('aborts after text-only responses without transition calls', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); // LLM returns text-only responses (no tool calls, no transition) 4 times // MAX_TEXT_ONLY_RETRIES is 3, so after 3 text-only responses it should ABORT const client = new FakeClient([ [{ type: 'text', text: 'I will do the work' }, { type: 'done' }], [{ type: 'text', text: 'Still working on it' }, { type: 'done' }], [{ type: 'text', text: 'Almost done' }, { type: 'done' }], // Should not reach this [{ type: 'text', text: 'Never reached' }, { type: 'done' }], ]); const result = await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), ); expect(result.next).toBe('ABORT'); expect(executeToolMock).not.toHaveBeenCalled(); }); it('resets text-only counter when tool calls happen in between', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'ok', isError: false }); // Alternate: text-only → tool → text-only → tool → text-only → tool → complete // Without the fix, the 3 text-only responses would trigger abort. // With the fix, each tool call resets the counter so it never reaches 3 consecutive. const client = new FakeClient([ [{ type: 'text', text: 'Thinking about the answer...' 
}, { type: 'done' }], [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } }, { type: 'done' }, ], [{ type: 'text', text: 'Found some info, analyzing...' }, { type: 'done' }], [ { type: 'tool_use', id: 'read-2', name: 'Read', input: { file_path: 'b.txt' } }, { type: 'done' }, ], [{ type: 'text', text: 'Almost done with analysis' }, { type: 'done' }], [ { type: 'tool_use', id: 'complete-1', name: 'complete', input: { status: 'success', result: 'Done' } }, { type: 'done' }, ], ]); const result = await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), ); expect(result.next).not.toBe('ABORT'); expect(executeToolMock).toHaveBeenCalled(); }); it('aborts when maxIterations is exceeded', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'ok', isError: false }); // LLM keeps using tools but never calls transition const toolResponses = Array.from({ length: 4 }, () => [ { type: 'tool_use' as const, id: `read-${Math.random()}`, name: 'Read', input: { file_path: 'a.txt' } }, { type: 'done' as const }, ]); const client = new FakeClient(toolResponses); const result = await executeMovement( makeMovement(['Read']), 'task', client as never, makeContext(), { maxIterations: 2 }, ); expect(result.next).toBe('ABORT'); expect(result.output).toContain('iteration'); }); }); describe('executeMovement cross-movement Read cache', () => { afterEach(() => { executeToolMock.mockReset(); getToolDefsMock.mockReset(); }); it('returns a cached Read result when a later movement reads the same file', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); const fileContent = 'const answer = 42;\n'; executeToolMock.mockImplementation(async (name: string, input: Record) => { if (name === 'Read' && input['file_path'] === 'foo.ts') { return { output: fileContent, isError: false }; } return { output: 'unexpected', isError: true }; }); const cache = new 
ToolResultCache(); const movementA: Movement = { name: 'investigate', edit: false, persona: 'investigator', instruction: 'Look at foo.ts.', allowedTools: ['Read'], rules: [{ condition: 'done', next: 'plan' }], defaultNext: 'plan', }; const clientA = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }, ], [ { type: 'tool_use', id: 'transition-1', name: 'transition', input: { next_step: 'plan', summary: 'foo.ts read' } }, { type: 'done' }, ], ]); const resultA = await executeMovement(movementA, 'task', clientA as never, makeContext(), { toolResultCache: cache, }); expect(resultA.next).toBe('plan'); expect(executeToolMock).toHaveBeenCalledTimes(1); expect(cache.size()).toBe(1); const observedToolResults: string[] = []; const movementB: Movement = { name: 'plan', edit: false, persona: 'planner', instruction: 'Plan based on foo.ts.', allowedTools: ['Read'], rules: [{ condition: 'done', next: 'COMPLETE' }], defaultNext: 'COMPLETE', }; const clientB = new FakeClient([ [ { type: 'tool_use', id: 'read-2', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }, ], [ { type: 'tool_use', id: 'transition-2', name: 'complete', input: { status: 'success', result: 'used cached read' } }, { type: 'done' }, ], ]); const resultB = await executeMovement(movementB, 'task', clientB as never, makeContext(), { toolResultCache: cache, callbacks: { onToolResult: (_name, info) => observedToolResults.push(info.result), }, }); expect(resultB.next).toBe('COMPLETE'); // executeTool was NOT invoked again — the cache short-circuited the second Read. expect(executeToolMock).toHaveBeenCalledTimes(1); // The LLM saw a cached-tagged result, not the raw content. 
expect(observedToolResults.some((r) => r.startsWith('[cached: Read foo.ts from movement investigate at '))).toBe(true); expect(observedToolResults.some((r) => r.endsWith(fileContent))).toBe(true); }); it('does not cache error results', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'ENOENT: no such file', isError: true }); const cache = new ToolResultCache(); const movement: Movement = { name: 'investigate', edit: false, persona: 'investigator', instruction: 'Look at missing.ts.', allowedTools: ['Read'], rules: [{ condition: 'done', next: 'COMPLETE' }], defaultNext: 'COMPLETE', }; const client = new FakeClient([ [ { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'missing.ts' } }, { type: 'done' }, ], [ { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'tried' } }, { type: 'done' }, ], ]); await executeMovement(movement, 'task', client as never, makeContext(), { toolResultCache: cache }); expect(cache.size()).toBe(0); }); it('skips caching tools outside the cacheable allowlist (Bash etc.)', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Bash'])); executeToolMock.mockResolvedValue({ output: 'shell ok', isError: false }); const cache = new ToolResultCache(); const movement: Movement = { name: 'investigate', edit: false, persona: 'investigator', instruction: 'Run.', allowedTools: ['Bash'], rules: [{ condition: 'done', next: 'COMPLETE' }], defaultNext: 'COMPLETE', }; const client = new FakeClient([ [ { type: 'tool_use', id: 'b-1', name: 'Bash', input: { command: 'echo hi' } }, { type: 'done' }, ], [ { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'ran' } }, { type: 'done' }, ], ]); await executeMovement(movement, 'task', client as never, makeContext(), { toolResultCache: cache }); expect(cache.size()).toBe(0); }); }); describe('executeMovement Phase 2 cache invalidation', () => 
{
  // Reset both hoisted mocks after every test so implementations never leak across cases.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  // Movement A caches a Read, movement B edits the same path, movement C reads again.
  // The third movement must hit the tool for real and observe the post-edit version.
  it('invalidates cached Read after a successful Edit on the same path', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    // Records the file version seen by every Read that actually reached the tool.
    const reads: number[] = [];
    let version = 1;
    executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
      if (name === 'Read' && input['file_path'] === 'foo.ts') {
        reads.push(version);
        return { output: `version=${version}`, isError: false };
      }
      if (name === 'Edit' && input['file_path'] === 'foo.ts') {
        version++;
        return { output: 'edited', isError: false };
      }
      return { output: 'unexpected', isError: true };
    });

    const cache = new ToolResultCache();
    const movementA: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 'read' } }, { type: 'done' }],
    ]);
    await executeMovement(movementA, 'task', clientA as never, makeContext(), { toolResultCache: cache });
    expect(cache.size()).toBe(1);

    const movementB: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit'],
      rules: [{ condition: 'done', next: 'verify' }],
      defaultNext: 'verify',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'transition', input: { next_step: 'verify', summary: 'edited' } }, { type: 'done' }],
    ]);
    await executeMovement(movementB, 'task', clientB as never, makeContext(), { toolResultCache: cache });
    expect(cache.size()).toBe(0); // Edit evicted the Read entry

    const observed: string[] = [];
    const movementC: Movement = {
      name: 'verify',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientC = new FakeClient([
      [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't3', name: 'complete', input: { status: 'success', result: 're-read' } }, { type: 'done' }],
    ]);
    await executeMovement(movementC, 'task', clientC as never, makeContext(), {
      toolResultCache: cache,
      callbacks: { onToolResult: (_n, info) => observed.push(info.result) },
    });

    // Re-read happened (no cache hit) and saw the post-edit version.
    expect(reads).toEqual([1, 2]);
    expect(observed).toContain('version=2');
    expect(observed.some((r) => r.startsWith('[cached:'))).toBe(false);
  });

  // Two files are cached, only foo.ts is edited: foo.ts must be re-read while
  // bar.ts keeps being served from the cache.
  it('invalidates only the affected path; other cached files survive', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    // Every call that reaches the (mocked) tool layer, in order.
    const callLog: Array<{ name: string; path?: string }> = [];
    executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
      callLog.push({ name, path: input['file_path'] as string | undefined });
      if (name === 'Read') return { output: `${input['file_path']} body`, isError: false };
      if (name === 'Edit') return { output: 'edited', isError: false };
      return { output: 'x', isError: true };
    });

    const cache = new ToolResultCache();
    const movA: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'bar.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movA, 'task', clientA as never, makeContext(), { toolResultCache: cache });
    expect(cache.size()).toBe(2);

    const movB: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit', 'Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const observed: string[] = [];
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [
        { type: 'tool_use', id: 'r3', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'tool_use', id: 'r4', name: 'Read', input: { file_path: 'bar.ts' } },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, makeContext(), {
      toolResultCache: cache,
      callbacks: { onToolResult: (_n, info) => observed.push(info.result) },
    });

    const fooReadCount = callLog.filter((c) => c.name === 'Read' && c.path === 'foo.ts').length;
    const barReadCount = callLog.filter((c) => c.name === 'Read' && c.path === 'bar.ts').length;
    expect(fooReadCount).toBe(2); // re-read after Edit
    expect(barReadCount).toBe(1); // bar still cached
    expect(observed.some((r) => r.startsWith('[cached: Read bar.ts'))).toBe(true);
    expect(observed.some((r) => r.startsWith('[cached: Read foo.ts'))).toBe(false);
  });

  // A successful Bash call is expected to empty the cache of every Read entry.
  it('Bash invalidates every file-derived entry conservatively', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Bash']));
    let readCalls = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'Read') {
        readCalls++;
        return { output: `read ${readCalls}`, isError: false };
      }
      if (name === 'Bash') return { output: 'shell ok', isError: false };
      return { output: 'x', isError: true };
    });

    const cache = new ToolResultCache();
    const movA: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'bar.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movA, 'task', clientA as never, makeContext(), { toolResultCache: cache });
    expect(cache.size()).toBe(2);

    const movB: Movement = {
      name: 'execute',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Bash'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'b1', name: 'Bash', input: { command: 'echo hi' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, makeContext(), { toolResultCache: cache });
    expect(cache.size()).toBe(0); // every Read evicted
  });

  // The mocked Edit reports isError: true; the cached Read entry must survive.
  it('does NOT invalidate when Edit/Write/Bash returned an error', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    let editAttempts = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'Read') return { output: 'body', isError: false };
      if (name === 'Edit') {
        editAttempts++;
        return { output: 'permission denied', isError: true };
      }
      return { output: 'x', isError: true };
    });

    const cache = new ToolResultCache();
    const movA: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary:
's' } }, { type: 'done' }],
    ]);
    await executeMovement(movA, 'task', clientA as never, makeContext(), { toolResultCache: cache });
    expect(cache.size()).toBe(1);

    const movB: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, makeContext(), { toolResultCache: cache });
    expect(editAttempts).toBe(1);
    expect(cache.size()).toBe(1); // failed Edit kept the cache intact
  });
});

// WorkspaceMemory integration: memory_update payloads carried by transition calls
// are stored in the shared memory object and rendered into later system prompts.
describe('executeMovement Phase 3 WorkspaceMemory', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('persists memory_update from one movement and exposes it in the next system prompt', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });
    const memory = new WorkspaceMemory();

    const movementA: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'investigator',
      instruction: 'Find the bug.',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'plan',
            summary: 'investigated',
            memory_update: {
              facts: [
                { claim: 'foo.ts uses bar()', evidence_paths: ['foo.ts'], confidence: 'high' },
              ],
              decisions: [{ text: 'patch foo.ts directly', evidence_paths: ['foo.ts'] }],
              do_not_repeat: ['re-read foo.ts unless evidence breaks'],
            },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(movementA, 'task', clientA as never, makeContext(), { workspaceMemory: memory });
    expect(memory.size().facts).toBe(1);
    expect(memory.size().decisions).toBe(1);
    expect(memory.size().doNotRepeat).toBe(1);

    const movementB: Movement = {
      name: 'plan',
      edit: false,
      persona: 'planner',
      instruction: 'Plan the fix.',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 'planned' } }, { type: 'done' }],
    ]);
    await executeMovement(movementB, 'task', clientB as never, makeContext(), { workspaceMemory: memory });

    // The first message of the first LLM call is the system prompt under inspection.
    const systemMsg = clientB.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(systemMsg[0]!.role).toBe('system');
    expect(systemMsg[0]!.content).toContain('## これまでに蓄積した観測');
    expect(systemMsg[0]!.content).toContain('foo.ts uses bar()');
    expect(systemMsg[0]!.content).toContain('patch foo.ts directly');
    expect(systemMsg[0]!.content).toContain('re-read foo.ts unless evidence breaks');
    expect(systemMsg[0]!.content).toContain('memory は再調査禁止の根拠ではなく');
  });

  it('omits memory section when memory is empty', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const movement: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const client = new FakeClient([
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(movement, 'task', client as never, makeContext(), { workspaceMemory: memory });
    const systemMsg = client.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(systemMsg[0]!.content).not.toContain('## これまでに蓄積した観測');
  });

  // Two facts are stored with different evidence paths; editing foo.ts must drop
  // only the fact that cites foo.ts.
  it('Edit invalidates a fact whose evidence_paths includes the edited file', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'Read') return { output: 'body', isError: false };
      if (name === 'Edit') return { output: 'edited', isError: false };
      return { output: 'x', isError: true };
    });
    const memory = new WorkspaceMemory();

    const movA: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'execute',
            summary: 'investigated',
            memory_update: {
              facts: [
                { claim: 'foo.ts uses bar()', evidence_paths: ['foo.ts'] },
                { claim: 'unrelated truth', evidence_paths: ['bar.ts'] },
              ],
            },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(movA, 'task', clientA as never, makeContext(), { workspaceMemory: memory });
    expect(memory.snapshot().facts).toHaveLength(2);

    const movB: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 'edited' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, makeContext(), { workspaceMemory: memory });

    const snapshotAfter = memory.snapshot();
    expect(snapshotAfter.facts).toHaveLength(1);
    expect(snapshotAfter.facts[0]!.claim).toBe('unrelated truth');
  });

  it('still works when piece submits no memory_update (backward compat)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
executeToolMock.mockResolvedValue({ output: 'body', isError: false });
    const memory = new WorkspaceMemory();
    const movement: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const client = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 'just summary' } }, { type: 'done' }],
    ]);
    const result = await executeMovement(movement, 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(result.next).toBe('COMPLETE');
    expect(memory.size().facts).toBe(0);
  });
});

// Cache coverage beyond Read: Grep, WebFetch and ReadPdf results are exercised
// across movements that share a single ToolResultCache instance.
describe('executeMovement Phase 4 cache extension', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  // The same Grep input issued in a second movement must not reach the tool again.
  it('caches Grep results and serves them to a later movement', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Grep']));
    let grepCalls = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'Grep') {
        grepCalls++;
        return { output: 'matches in src/foo.ts:1\n', isError: false };
      }
      return { output: 'x', isError: true };
    });

    const cache = new ToolResultCache();
    const movA: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Grep'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'g1', name: 'Grep', input: { pattern: 'foo', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movA, 'task', clientA as never, makeContext(), { toolResultCache: cache });
    expect(grepCalls).toBe(1);

    const observed: string[] = [];
    const movB: Movement = {
      name: 'plan',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Grep'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'g2', name: 'Grep', input: { pattern: 'foo', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, makeContext(), {
      toolResultCache: cache,
      callbacks: { onToolResult: (_n, info) => observed.push(info.result) },
    });
    expect(grepCalls).toBe(1);
    expect(observed.some((r) => r.startsWith('[cached: Grep foo'))).toBe(true);
  });

  // The Edit targets a file outside the Grep path, yet the Grep must still re-run.
  it('Edit invalidates ALL search entries (Grep) regardless of scope', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Grep', 'Edit']));
    let grepCalls = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'Grep') {
        grepCalls++;
        return { output: `match v${grepCalls}`, isError: false };
      }
      if (name === 'Edit') return { output: 'edited', isError: false };
      return { output: 'x', isError: true };
    });

    const cache = new ToolResultCache();
    const movA: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Grep'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'g1', name: 'Grep', input: { pattern: 'x', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movA, 'task', clientA as never, makeContext(), { toolResultCache: cache });

    const movB: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit', 'Grep'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'unrelated.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'g2', name: 'Grep', input: { pattern: 'x', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, makeContext(), { toolResultCache: cache });
    expect(grepCalls).toBe(2);
  });

  // A WebFetch entry must still be served from cache after an Edit ran in between.
  it('caches WebFetch and survives Edit invalidation', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['WebFetch', 'Edit']));
    let fetchCalls = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'WebFetch') {
        fetchCalls++;
        return { output: 'page body', isError: false };
      }
      if (name === 'Edit') return { output: 'edited', isError: false };
      return { output: 'x', isError: true };
    });

    const cache = new ToolResultCache();
    const movA: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['WebFetch'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'w1', name: 'WebFetch', input: { url: 'https://example.com/x' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movA, 'task', clientA as never, makeContext(), { toolResultCache: cache });

    const movB: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit', 'WebFetch'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'w2', name: 'WebFetch', input: { url: 'https://example.com/x' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movB,
'task', clientB as never, makeContext(), { toolResultCache: cache });
    expect(fetchCalls).toBe(1);
  });

  // Same path + same page range is a cache hit; a different range is a miss.
  it('caches Office tool results (ReadPdf) keyed by path + range', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['ReadPdf']));
    let pdfCalls = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'ReadPdf') {
        pdfCalls++;
        return { output: 'pdf body', isError: false };
      }
      return { output: 'x', isError: true };
    });

    const cache = new ToolResultCache();
    const mov: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['ReadPdf'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'p1', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '1-3' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(mov, 'task', clientA as never, makeContext(), { toolResultCache: cache });
    expect(pdfCalls).toBe(1);

    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'p2', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '1-3' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'p3', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '4-6' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(mov, 'task', clientB as never, makeContext(), { toolResultCache: cache });
    expect(pdfCalls).toBe(2);
  });
});

// Checks the checklist-related wording of the system prompt sent on the first LLM call.
describe('buildSystemPrompt checklist guidance', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('emits a "進捗管理" section with strengthened CreateChecklist guidance', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const movement: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const client = new FakeClient([
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movement, 'task', client as never, makeContext());

    const systemMsg = client.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(systemMsg[0]!.role).toBe('system');
    expect(systemMsg[0]!.content).toContain('## 進捗管理');
    expect(systemMsg[0]!.content).toContain('CreateChecklist');
    expect(systemMsg[0]!.content).toContain('全 piece で常時利用可能');
    // Stronger language (option A): an explicit "three or more tool calls" criterion,
    // "when in doubt, create one", and an explicit note about the second and later exchanges.
    expect(systemMsg[0]!.content).toContain('3 個以上のツール呼び出し');
    expect(systemMsg[0]!.content).toContain('判断に迷ったら作る');
    expect(systemMsg[0]!.content).toContain('2 回目以降のやり取り');
  });
});

// Watchdog behaviour: a reminder message tagged "[checklist watchdog]" is looked for
// in the conversation history that reaches the final LLM call.
describe('checklist watchdog (option B)', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('injects a one-shot reminder after 5 iterations without any checklist tool', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // 6 iterations of plain Read → 1 iteration with complete. The watchdog
    // should fire on iteration 5 (0-indexed).
    // Element type is kept opaque; the list is handed to FakeClient via `as never`.
    const responses: unknown[][] = [];
    for (let i = 0; i < 6; i++) {
      responses.push([
        { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
        { type: 'done' },
      ]);
    }
    responses.push([
      { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ]);
    const client = new FakeClient(responses as never);

    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());

    // Find any user-role message containing the watchdog tag.
const allMessagesAtFinalCall = client.calls[client.calls.length - 1]!.messages as Array<{ role: string; content?: unknown }>;
    const watchdogMessages = allMessagesAtFinalCall.filter(
      (m) => m.role === 'user' && typeof m.content === 'string' && m.content.includes('[checklist watchdog]'),
    );
    expect(watchdogMessages).toHaveLength(1);
  });

  it('does NOT fire when CreateChecklist is called early', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'CreateChecklist']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // CreateChecklist on iteration 0, then several Read, then complete.
    // Element type is kept opaque; the list is handed to FakeClient via `as never`.
    const responses: unknown[][] = [
      [{ type: 'tool_use', id: 'cc', name: 'CreateChecklist', input: { name: 'plan', items: [] } }, { type: 'done' }],
    ];
    for (let i = 0; i < 7; i++) {
      responses.push([
        { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
        { type: 'done' },
      ]);
    }
    responses.push([
      { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ]);
    const client = new FakeClient(responses as never);

    await executeMovement(makeMovement(['Read', 'CreateChecklist']), 'task', client as never, makeContext());

    const allMessagesAtFinalCall = client.calls[client.calls.length - 1]!.messages as Array<{ role: string; content?: unknown }>;
    const watchdogMessages = allMessagesAtFinalCall.filter(
      (m) => m.role === 'user' && typeof m.content === 'string' && m.content.includes('[checklist watchdog]'),
    );
    expect(watchdogMessages).toHaveLength(0);
  });
});

// ============================================================
// Phase 6a — `complete` tool (§7.1–7.7 of design plan)
// ============================================================

// Maps each `complete` status onto the movement result (next / output) and checks
// that arguments missing their required companion field trigger a retry.
describe('Phase 6a: complete tool — §7.1 native basics', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('success status with result becomes the movement output (no chatter contamination)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      [
        { type: 'text', text: "I'll figure this out first" }, // chatter to be ignored
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'final answer' } },
        { type: 'done' },
      ],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
    expect(result.next).toBe('COMPLETE');
    expect(result.output).toBe('final answer');
    expect(result.output).not.toContain("I'll figure this out first");
  });

  it('aborted status routes via ABORT next', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'aborted', abort_reason: 'tool unavailable' } },
        { type: 'done' },
      ],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
    expect(result.next).toBe('ABORT');
    expect(result.output).toBe('tool unavailable');
  });

  it('needs_user_input routes via ASK next', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: { status: 'needs_user_input', missing_info: 'which dataset?', why_no_default: 'multiple options' },
        },
        { type: 'done' },
      ],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
    expect(result.next).toBe('ASK');
    expect(result.output).toBe('which dataset?');
  });

  it('rejects success with empty result and forces retry (no accumulatedText fallback for native)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      [
        { type: 'text', text: 'this would have been the fallback' },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'real answer' } },
        { type: 'done' },
      ],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
    expect(result.next).toBe('COMPLETE');
    expect(result.output).toBe('real answer');
    // The fallback chatter must NOT have leaked into the output via retry.
    expect(result.output).not.toContain('this would have been the fallback');
  });

  it('rejects aborted without abort_reason and forces retry', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'aborted' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'aborted', abort_reason: 'real reason' } }, { type: 'done' }],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
    expect(result.next).toBe('ABORT');
    expect(result.output).toBe('real reason');
  });
});

// Behaviour when a single iteration carries an invalid or duplicated `complete` call.
describe('Phase 6a: complete tool — §7.2 precedence rules (§2.5)', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('invalid native complete forces retry (no fallback)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      // First iteration: invalid (empty result) — must retry
      [
        { type: 'text', text: 'this would have been the fallback' },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // Second iteration: LLM corrects with valid native
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'native corrected' } },
        { type: 'done' },
      ],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
    expect(result.output).toBe('native corrected');
    // accumulatedText must NOT have leaked into the output via retry
    expect(result.output).not.toContain('fallback');
  });
it('two native completes with conflicting args → retry', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      // Two complete calls in one iteration that disagree on `result`.
      [
        { type: 'tool_use', id: 'a', name: 'complete', input: { status: 'success', result: 'A' } },
        { type: 'tool_use', id: 'b', name: 'complete', input: { status: 'success', result: 'B' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'consistent' } },
        { type: 'done' },
      ],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
    expect(result.output).toBe('consistent');
  });

  it('two native completes with identical args → first one used', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      // Only one scripted response exists, so the movement must finish without a retry.
      [
        { type: 'tool_use', id: 'a', name: 'complete', input: { status: 'success', result: 'same answer' } },
        { type: 'tool_use', id: 'b', name: 'complete', input: { status: 'success', result: 'same answer' } },
        { type: 'done' },
      ],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
    expect(result.output).toBe('same answer');
  });
});

// Regression coverage for the `transition` tool alongside `complete`.
describe('Phase 6a: complete tool — §7.4 regressions', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('non-terminal transition (movement-to-movement) still works', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const movement: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const client = new FakeClient([
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 'investigated' } }, { type: 'done' }],
    ]);
    const result = await executeMovement(movement, 'task', client as never, makeContext());
    expect(result.next).toBe('plan');
    expect(result.output).toBe('investigated');
});

  it('transition({next_step:"COMPLETE"}) is rejected (Phase 6b)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    // FakeClient bypasses tool-schema validation, but the engine's
    // validateTransition still rejects unknown next values, so an LLM
    // emitting a legacy terminal transition gets a tool-result error and
    // must retry. This locks in the no-shim behavior.
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: { next_step: 'COMPLETE', summary: 'should not be accepted' },
        },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'corrected' } }, { type: 'done' }],
    ]);
    const movement: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      // No COMPLETE in rules — legacy terminal cannot succeed.
      rules: [{ condition: 'fix needed', next: 'fix' }],
    };
    const result = await executeMovement(movement, 'task', client as never, makeContext());
    expect(result.output).toBe('corrected');
  });
});

// memory_update carried inside a `complete` call: committed when the call is
// accepted, discarded when the call is rejected and retried.
describe('Phase 6a: complete tool — §7.5 memory_update behavior', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('memory_update inside complete is applied to WorkspaceMemory (success)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'success',
            result: 'done',
            memory_update: { facts: [{ claim: 'X is Y', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(memory.size().facts).toBe(1);
  });

  it('memory_update is NOT applied when complete is invalid and retried', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // Invalid first attempt — memory_update must NOT commit
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'success',
            result: '',
            memory_update: { facts: [{ claim: 'should not commit', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
      // Valid retry — different fact, must commit only this one
      [
        {
          type: 'tool_use',
          id: 'c2',
          name: 'complete',
          input: {
            status: 'success',
            result: 'fixed',
            memory_update: { facts: [{ claim: 'committed once', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(memory.size().facts).toBe(1);
    expect(memory.snapshot().facts[0]!.claim).toBe('committed once');
  });

  it('memory_update applied for status=aborted', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'aborted',
            abort_reason: 'tool unavailable',
            memory_update: { facts: [{ claim: 'lesson from failure', confidence: 'medium' }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(memory.size().facts).toBe(1);
  });
});

// Conversation-history integrity when an iteration is rejected and retried.
describe('Phase 6a: complete tool — §7.7 v3 critical (Conditional Go) tests', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('all tool_use ids get a tool_result on retry (conversation history integrity)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'fail-1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'tool_use', id: 'ignored-1', name: 'complete', input: { status: 'success', result: 'X' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'good', name: 'complete', input: { status: 'success', result: 'recovered' } },
        { type: 'done'
},
      ],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());

    // Inspect the messages sent on the second call — every prior tool_use must have a tool_result.
    const secondCallMessages = client.calls[1]?.messages as Array<{ role: string; tool_call_id?: string }>;
    const toolResultIds = secondCallMessages.filter((m) => m.role === 'tool').map((m) => m.tool_call_id);
    expect(toolResultIds).toContain('fail-1');
    expect(toolResultIds).toContain('ignored-1');
  });
});

// ============================================================
// Phase 6c — `memory_update` standalone tool
// ============================================================

// memory_update issued as its own tool call, independent of transition/complete.
describe('Phase 6c: memory_update tool', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('mid-movement memory_update commits before terminal selection (visible to LLM next iteration)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: emit memory_update only
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: {
            facts: [{ claim: 'mid-movement observation' }],
          },
        },
        { type: 'done' },
      ],
      // iteration 1: terminal complete
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(result.next).toBe('COMPLETE');
    expect(memory.size().facts).toBe(1);
    expect(memory.snapshot().facts[0]!.claim).toBe('mid-movement observation');
  });

  it('returns "no changes committed" tool_result for empty payload', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {} },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });

    // The next iteration's messages include the tool_result for mu-1.
    const secondCallMessages = client.calls[1]?.messages as Array<{ role: string; content?: unknown; tool_call_id?: string }>;
    const muResult = secondCallMessages.find((m) => m.role === 'tool' && m.tool_call_id === 'mu-1');
    expect(muResult).toBeDefined();
    expect(String(muResult?.content)).toMatch(/no changes committed/);
    expect(memory.size().facts).toBe(0);
  });

  it('exact-claim duplicate within same iteration is merged (Codex Phase 6c §2.5)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: {
            facts: [{ claim: 'X is Y', evidence_paths: ['a.ts'] }],
          },
        },
        {
          type: 'tool_use',
          id: 'mu-2',
          name: 'memory_update',
          input: {
            facts: [{ claim: 'X is Y', evidence_paths: ['b.ts'] }],
          },
        },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });

    const snap = memory.snapshot();
    expect(snap.facts).toHaveLength(1);
    // evidence_paths should be union-merged, not duplicated.
    // Sort a copy: Array.prototype.sort works in place and would reorder the snapshot's array.
    expect([...snap.facts[0]!.evidencePaths].sort()).toEqual(['a.ts', 'b.ts']);
  });

  it('memory_update commits even when complete in the same iteration is invalid (retry path)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: memory_update + invalid complete (empty result) → retry
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: {
            facts: [{ claim: 'observation persists' }],
          },
        },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // iteration 1: corrected complete
      [{ type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(result.next).toBe('COMPLETE');
    // The memory_update from the FIRST iteration should still be committed,
    // even though the iteration retried due to invalid complete args.
expect(memory.snapshot().facts.map((f) => f.claim)).toContain('observation persists'); }); it('memory_update is a META_TOOL — works even when allowed_tools is empty', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs([])); const memory = new WorkspaceMemory(); const client = new FakeClient([ [ { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: { facts: [{ claim: 'still works' }], } }, { type: 'done' }, ], [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }], ]); const movement: Movement = { name: 'execute', edit: false, persona: 'p', instruction: 'i', allowedTools: [], // empty — yet memory_update is in the tool catalog rules: [{ condition: 'done', next: 'COMPLETE' }], defaultNext: 'COMPLETE', }; await executeMovement(movement, 'task', client as never, makeContext(), { workspaceMemory: memory }); expect(memory.size().facts).toBe(1); }); }); // ============================================================ // Traceability T-1 — events.jsonl emission scenarios // ============================================================ import { mkdtempSync, rmSync, readFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { createFileEventLogger, parseEventLine, type EventBase } from '../progress/event-log.js'; function readEvents(workspace: string): EventBase[] { const path = join(workspace, 'logs', 'events.jsonl'); const lines = readFileSync(path, 'utf-8').trim().split('\n').filter((l) => l.length > 0); return lines.map((l) => { const parsed = parseEventLine(l); if (parsed.kind !== 'ok') throw new Error(`unexpected non-ok event line: ${l}`); return parsed.event; }); } describe('Traceability T-1: agent-loop emission', () => { let workspace: string; beforeEach(() => { workspace = mkdtempSync(join(tmpdir(), 'trace-emission-')); }); afterEach(() => { rmSync(workspace, { recursive: true, force: true }); }); function makeContextWithEvents(workspaceLocal: 
string): ToolContext { return { workspacePath: workspaceLocal, editAllowed: true, eventLogger: createFileEventLogger({ workspacePath: workspaceLocal, runId: 'test-run' }), }; } it('emits movement_start, tool_call/result pair, and movement_complete for a single Read flow', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'body', isError: false }); const client = new FakeClient([ [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }], [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }], ]); await executeMovement(makeMovement(['Read']), 'task', client as never, makeContextWithEvents(workspace)); const events = readEvents(workspace); const kinds = events.map((e) => e.kind); expect(kinds).toContain('movement_start'); expect(kinds).toContain('tool_call'); expect(kinds).toContain('tool_result'); expect(kinds).toContain('complete'); expect(kinds).toContain('movement_complete'); const call = events.find((e) => e.kind === 'tool_call' && e.llmToolCallId === 'r1'); const result = events.find((e) => e.kind === 'tool_result' && e.llmToolCallId === 'r1'); expect(call?.correlationId).toBeTruthy(); expect(result?.correlationId).toBe(call?.correlationId); expect((result?.payload as { cacheHit: boolean }).cacheHit).toBe(false); }); it('emits cache_set on first Read and cache_hit on second Read of the same file', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'body', isError: false }); const cache = new ToolResultCache(); const ctx = makeContextWithEvents(workspace); const movA: Movement = { name: 'investigate', edit: false, persona: 'p', instruction: 'i', allowedTools: ['Read'], rules: [{ condition: 'done', next: 'plan' }], defaultNext: 'plan', }; const clientA = new FakeClient([ [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 
'foo.ts' } }, { type: 'done' }], [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 's' } }, { type: 'done' }], ]); await executeMovement(movA, 'task', clientA as never, ctx, { toolResultCache: cache }); const movB: Movement = { name: 'plan', edit: false, persona: 'p', instruction: 'i', allowedTools: ['Read'], rules: [{ condition: 'done', next: 'COMPLETE' }], defaultNext: 'COMPLETE', }; const clientB = new FakeClient([ [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }], [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }], ]); await executeMovement(movB, 'task', clientB as never, ctx, { toolResultCache: cache }); const events = readEvents(workspace); expect(events.some((e) => e.kind === 'cache_set')).toBe(true); expect(events.some((e) => e.kind === 'cache_hit')).toBe(true); const cacheHit = events.find((e) => e.kind === 'cache_hit'); expect((cacheHit?.payload as { sourceMovement: string }).sourceMovement).toBe('investigate'); }); it('emits cache_invalidate and memory_invalidate after a successful Edit', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit'])); executeToolMock.mockImplementation(async (name: string) => { if (name === 'Read') return { output: 'body', isError: false }; if (name === 'Edit') return { output: 'edited', isError: false }; return { output: 'x', isError: true }; }); const cache = new ToolResultCache(); const memory = new WorkspaceMemory(); const ctx = makeContextWithEvents(workspace); const movA: Movement = { name: 'investigate', edit: true, persona: 'p', instruction: 'i', allowedTools: ['Read'], rules: [{ condition: 'done', next: 'execute' }], defaultNext: 'execute', }; const clientA = new FakeClient([ [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }], [ { type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 
'execute', summary: 's', memory_update: { facts: [{ claim: 'foo.ts uses bar', evidence_paths: ['foo.ts'] }] }, } }, { type: 'done' }, ], ]); await executeMovement(movA, 'task', clientA as never, ctx, { toolResultCache: cache, workspaceMemory: memory }); const movB: Movement = { name: 'execute', edit: true, persona: 'p', instruction: 'i', allowedTools: ['Edit'], rules: [{ condition: 'done', next: 'COMPLETE' }], defaultNext: 'COMPLETE', }; const clientB = new FakeClient([ [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }], [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }], ]); await executeMovement(movB, 'task', clientB as never, ctx, { toolResultCache: cache, workspaceMemory: memory }); const events = readEvents(workspace); expect(events.some((e) => e.kind === 'cache_invalidate')).toBe(true); expect(events.some((e) => e.kind === 'memory_invalidate')).toBe(true); }); it('emits memory_update_call with counts when the LLM calls memory_update', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); const memory = new WorkspaceMemory(); const ctx = makeContextWithEvents(workspace); const client = new FakeClient([ [ { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: { facts: [{ claim: 'x' }, { claim: 'y' }] } }, { type: 'done' }, ], [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }], ]); await executeMovement(makeMovement(['Read']), 'task', client as never, ctx, { workspaceMemory: memory }); const events = readEvents(workspace); const muCall = events.find((e) => e.kind === 'memory_update_call'); expect(muCall).toBeDefined(); const payload = muCall?.payload as { counts: { factsAdded: number } }; expect(payload.counts.factsAdded).toBe(2); }); it('emits watchdog_fire when no checklist tool is used in 5 iterations', async () => { 
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); executeToolMock.mockResolvedValue({ output: 'ok', isError: false }); const responses: Array> = []; for (let i = 0; i < 6; i++) { responses.push([ { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } }, { type: 'done' }, ]); } responses.push([ { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }, ]); const client = new FakeClient(responses as never); await executeMovement(makeMovement(['Read']), 'task', client as never, makeContextWithEvents(workspace)); const events = readEvents(workspace); expect(events.some((e) => e.kind === 'watchdog_fire')).toBe(true); }); it('all events share the same runId across one piece run', async () => { getToolDefsMock.mockResolvedValue(makeToolDefs(['Read'])); const ctx = makeContextWithEvents(workspace); const client = new FakeClient([ [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }], ]); await executeMovement(makeMovement(['Read']), 'task', client as never, ctx); const events = readEvents(workspace); expect(events.length).toBeGreaterThan(0); const runIds = new Set(events.map((e) => e.runId)); expect(runIds.size).toBe(1); expect(runIds.has('test-run')).toBe(true); }); }); // --------------------------------------------------------------------------- // Phase 4 (SSH Console): buildSystemPrompt screen injection // --------------------------------------------------------------------------- import { buildSystemPrompt, __setActiveSessionLookup, type HandoffContext } from './agent-loop.js'; describe('buildSystemPrompt console injection', () => { afterEach(() => { __setActiveSessionLookup(null); }); function makeConsoleMovement(allowedTools: string[]): Movement { return { name: 'm1', edit: false, persona: 'p', instruction: 'i', allowedTools, rules: [{ condition: 'done', next: 'COMPLETE' }], defaultNext: 'COMPLETE', }; } it('appends 
screen block when SshConsole* in allowed_tools and session exists', () => { const fakeSession = { cols: 80, rows: 24, snapshotScreen: () => ({ text: 'mock-screen-line-1\nmock-screen-line-2' }), }; __setActiveSessionLookup((_tid: string) => fakeSession); const sys = buildSystemPrompt( makeConsoleMovement(['SshConsoleEnsure', 'SshConsoleSend', 'SshConsoleSnapshot']), 1, 5, [], undefined, null, undefined, undefined, undefined, 't1', ); expect(sys).toContain('Console screen'); expect(sys).toContain('mock-screen-line-1'); expect(sys).toContain('mock-screen-line-2'); }); it('does NOT inject when piece does not allow console tools', () => { const fakeSession = { cols: 80, rows: 24, snapshotScreen: () => ({ text: 'mock-screen-X' }), }; __setActiveSessionLookup((_tid: string) => fakeSession); const sys = buildSystemPrompt( makeConsoleMovement(['Read', 'Bash']), 1, 5, [], undefined, null, undefined, undefined, undefined, 't1', ); expect(sys).not.toContain('Console screen'); expect(sys).not.toContain('mock-screen-X'); }); it('does NOT inject when no active session is registered for the task', () => { __setActiveSessionLookup((_tid: string) => null); const sys = buildSystemPrompt( makeConsoleMovement(['SshConsoleSend']), 1, 5, [], undefined, null, undefined, undefined, undefined, 't1', ); expect(sys).not.toContain('Console screen'); }); it('does NOT inject when taskId is missing (subtask without local_task binding)', () => { const fakeSession = { cols: 80, rows: 24, snapshotScreen: () => ({ text: 'mock-screen' }), }; let lookedUp: string | null = null; __setActiveSessionLookup((tid: string) => { lookedUp = tid; return fakeSession; }); const sys = buildSystemPrompt( makeConsoleMovement(['SshConsoleSnapshot']), 1, 5, [], undefined, null, undefined, undefined, undefined, null, ); expect(sys).not.toContain('Console screen'); expect(lookedUp).toBeNull(); }); it('truncates injected screen to the configured tail length', () => { const allLines = Array.from({ length: 200 }, (_, i) => 
`line-${i + 1}`); const fakeSession = { cols: 80, rows: 24, snapshotScreen: () => ({ text: allLines.join('\n') }), }; __setActiveSessionLookup((_tid: string) => fakeSession); const sys = buildSystemPrompt( makeConsoleMovement(['SshConsoleSend']), 1, 5, [], undefined, null, undefined, undefined, undefined, 't1', ); // Default tail = 24 lines: should contain the last line but not very early lines. expect(sys).toContain('line-200'); expect(sys).toContain('line-177'); // 200 - 24 + 1 = 177 (inclusive tail) expect(sys).not.toContain('line-1\n'); expect(sys).not.toContain('line-50'); }); }); describe('buildSystemPrompt — handoff blocks', () => { // buildSystemPrompt is pure — we can call it directly. Build a minimal // Movement that won't trigger errors in the rendering pipeline. const movement: Movement = { name: 'investigate', persona: 'investigator', instruction: 'do the thing', rules: [{ condition: 'done', next: 'plan' }], allowedTools: [], edit: false, }; it('always includes the static Continue block', () => { const prompt = buildSystemPrompt(movement); expect(prompt).toContain('Continue 機能'); expect(prompt).toContain('output/ ファイルは次の piece でもそのまま参照されます'); }); it('omits the dynamic block when handoffContext is not provided', () => { const prompt = buildSystemPrompt(movement); expect(prompt).not.toContain('前 piece からの引き継ぎ'); }); it('includes the dynamic block with prev piece + result when handoffContext is provided', () => { const handoff: HandoffContext = { prevPiece: 'manual-writer', prevResult: 'output/manual.md is ready, with 5 sections covering setup', }; const prompt = buildSystemPrompt(movement, 1, 5, [], undefined, null, undefined, undefined, undefined, null, handoff); expect(prompt).toContain('前 piece からの引き継ぎ'); expect(prompt).toContain('manual-writer'); expect(prompt).toContain('output/manual.md is ready'); }); it('handles null prevResult gracefully', () => { const handoff: HandoffContext = { prevPiece: 'manual-writer', prevResult: null }; const prompt = 
buildSystemPrompt(movement, 1, 5, [], undefined, null, undefined, undefined, undefined, null, handoff); expect(prompt).toContain('前 piece からの引き継ぎ'); expect(prompt).toContain('前 piece は最終出力を残しませんでした'); }); it('truncates very long prevResult to ~2500 chars (head + tail)', () => { // Construct: 3000 'A's + unique 'BBBBBMIDDLE' + 3000 'C's. After // truncation: head=2000 of 'A', truncated marker, tail=last 500 of 'C'. // The middle marker should be cut out. const longResult = 'A'.repeat(3000) + 'BBBBBMIDDLE' + 'C'.repeat(3000); const handoff: HandoffContext = { prevPiece: 'manual-writer', prevResult: longResult }; const prompt = buildSystemPrompt(movement, 1, 5, [], undefined, null, undefined, undefined, undefined, null, handoff); expect(prompt).toContain('[truncated]'); expect(prompt).toContain('A'.repeat(100)); // head present expect(prompt).toContain('C'.repeat(100)); // tail present expect(prompt).not.toContain('BBBBBMIDDLE'); // middle was cut }); });