2369 lines
93 KiB
TypeScript
2369 lines
93 KiB
TypeScript
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||
import type { LLMEvent, ToolDef } from '../llm/openai-compat.js';
|
||
import type { ToolContext } from './tools/index.js';
|
||
import { stripThinkingTokens } from './agent-loop.js';
|
||
|
||
// Unit tests for stripThinkingTokens: each case feeds a string containing one
// flavour of "thinking" markup and checks the exact remaining text.
describe('stripThinkingTokens', () => {
  it('strips <think>...</think> blocks', () => {
    expect(stripThinkingTokens('<think>reasoning here</think>actual output')).toBe('actual output');
  });

  // Title fixed: the closing token exercised below is `<|/thinking|>`,
  // not `</|thinking|>` as the previous title claimed.
  it('strips <|thinking|>...<|/thinking|> blocks', () => {
    expect(stripThinkingTokens('<|thinking|>deep thought<|/thinking|>result')).toBe('result');
  });

  it('strips gemma4 thought<channel|> pattern', () => {
    // With nothing after the marker the whole string is removed.
    expect(stripThinkingTokens('thought\n<channel|>')).toBe('');
    expect(stripThinkingTokens('thought <channel|>hello')).toBe('hello');
  });

  it('strips <channel|>...<channel|> blocks', () => {
    expect(stripThinkingTokens('<channel|>internal reasoning<channel|>visible output')).toBe('visible output');
  });

  it('returns normal text unchanged', () => {
    expect(stripThinkingTokens('hello world')).toBe('hello world');
  });

  it('handles multiline thinking blocks', () => {
    expect(stripThinkingTokens('<think>\nline1\nline2\n</think>output')).toBe('output');
  });
});
|
||
|
||
// Mock handles are created through vi.hoisted so that they already exist when
// the vi.mock factory below is evaluated.
const { executeToolMock, getToolDefsMock } = vi.hoisted(() => {
  return {
    executeToolMock: vi.fn(),
    getToolDefsMock: vi.fn(),
  };
});

// Swap the tools module for the two mocks above.
vi.mock('./tools/index.js', () => {
  return {
    executeTool: executeToolMock,
    getToolDefs: getToolDefsMock,
  };
});
|
||
|
||
import { executeMovement, type Movement } from './agent-loop.js';
|
||
import { ToolResultCache } from './context/tool-result-cache.js';
|
||
import { WorkspaceMemory } from './context/workspace-memory.js';
|
||
|
||
/**
 * Builds the default editable "execute" movement used by most tests.
 *
 * @param allowedTools Tool names the movement may call (stored as-is).
 * @returns A movement whose only rule and whose default both lead to COMPLETE.
 */
function makeMovement(allowedTools: string[]): Movement {
  const movement: Movement = {
    name: 'execute',
    edit: true,
    persona: 'worker',
    instruction: 'Do the work.',
    allowedTools,
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  return movement;
}
|
||
|
||
/**
 * Produces one minimal function-style tool definition per name.
 *
 * @param names Tool names; each is also reused as the tool's description.
 * @returns Definitions in the same order as `names`, each with an empty
 *          object schema for its parameters.
 */
function makeToolDefs(names: string[]): ToolDef[] {
  const defs: ToolDef[] = [];
  for (const toolName of names) {
    defs.push({
      type: 'function',
      function: {
        name: toolName,
        description: toolName,
        // A fresh schema object per tool so definitions never share state.
        parameters: { type: 'object', properties: {}, required: [] },
      },
    });
  }
  return defs;
}
|
||
|
||
/**
 * Returns a fresh tool context pointing at a fixed temp workspace path with
 * editing allowed.
 */
function makeContext(): ToolContext {
  const context: ToolContext = {
    workspacePath: '/tmp/agent-loop-test',
    editAllowed: true,
  };
  return context;
}
|
||
|
||
/**
 * Creates a promise together with the function that fulfils it, so a test can
 * decide exactly when an awaited operation completes.
 *
 * @returns The pending promise and its `resolve` function.
 */
function deferred<T>(): { promise: Promise<T>; resolve: (value: T) => void } {
  // The Promise executor runs synchronously, so `release` is assigned before
  // this function returns.
  let release!: (value: T) => void;
  const promise = new Promise<T>((fulfil) => {
    release = fulfil;
  });
  return { promise, resolve: release };
}
|
||
|
||
/**
 * Polls `predicate` until it returns true, yielding to the timer queue between
 * checks.
 *
 * @param predicate Condition to wait for; evaluated once per poll.
 * @param timeoutMs Maximum wall-clock time to keep polling (default 250 ms).
 * @throws Error when the condition is still false after `timeoutMs`.
 */
async function waitFor(predicate: () => boolean, timeoutMs: number = 250): Promise<void> {
  const startedAt = Date.now();
  for (;;) {
    if (predicate()) {
      return;
    }
    if (Date.now() - startedAt > timeoutMs) {
      throw new Error('Timed out waiting for condition');
    }
    // setTimeout(0) lets pending timers and I/O callbacks run before re-checking.
    await new Promise((resolve) => setTimeout(resolve, 0));
  }
}
|
||
|
||
/**
 * Scripted stand-in for the LLM client. Each `chat()` call replays the next
 * pre-recorded list of events; once the script is exhausted, further calls
 * yield nothing.
 */
class FakeClient {
  /** Arguments of every `chat()` invocation, in call order. */
  readonly calls: Array<{ messages: unknown; tools?: unknown }> = [];

  /** Position of the next scripted response to replay. */
  private cursor = 0;

  constructor(private readonly responses: LLMEvent[][]) {}

  /**
   * Records the call and streams the next scripted response.
   *
   * @param messages Conversation passed by the caller (stored, not inspected).
   * @param tools Tool definitions passed by the caller (stored, not inspected).
   * @param _signal Accepted for signature compatibility; ignored.
   */
  async *chat(messages: unknown, tools?: unknown, _signal?: AbortSignal): AsyncGenerator<LLMEvent> {
    this.calls.push({ messages, tools });
    const scripted = this.responses[this.cursor];
    this.cursor += 1;
    yield* scripted ?? [];
  }
}
|
||
|
||
describe('executeMovement parallel tool execution', () => {
|
||
afterEach(() => {
  // Reset both shared mocks so no test sees another test's calls or stubs.
  for (const mock of [executeToolMock, getToolDefsMock]) {
    mock.mockReset();
  }
});
|
||
|
||
it('runs consecutive safe tool calls in parallel', async () => {
  // Both tool executions stay pending on their gates, so seeing two starts
  // before either gate is released shows the calls overlap.
  type ToolOutcome = { output: string; isError: boolean };
  const pendingRead = deferred<ToolOutcome>();
  const pendingGlob = deferred<ToolOutcome>();
  const startOrder: string[] = [];

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Glob']));
  executeToolMock.mockImplementation(async (toolName: string) => {
    startOrder.push(toolName);
    switch (toolName) {
      case 'Read':
        return pendingRead.promise;
      case 'Glob':
        return pendingGlob.promise;
      default:
        return { output: 'ok', isError: false };
    }
  });

  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
      { type: 'tool_use', id: 'glob-1', name: 'Glob', input: { pattern: 'input/*' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  const pendingRun = executeMovement(makeMovement(['Read', 'Glob']), 'task', client as never, makeContext());

  await waitFor(() => startOrder.length === 2);
  expect(startOrder).toEqual(['Read', 'Glob']);

  pendingRead.resolve({ output: 'read ok', isError: false });
  pendingGlob.resolve({ output: 'glob ok', isError: false });

  const outcome = await pendingRun;
  expect(outcome.next).toBe('COMPLETE');
  expect(executeToolMock).toHaveBeenCalledTimes(2);
});
|
||
|
||
it('keeps side-effecting tools sequential as a barrier', async () => {
  // Write must not start until the preceding Read has finished.
  type ToolOutcome = { output: string; isError: boolean };
  const pendingRead = deferred<ToolOutcome>();
  const pendingWrite = deferred<ToolOutcome>();
  const startOrder: string[] = [];

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Write']));
  executeToolMock.mockImplementation(async (toolName: string) => {
    startOrder.push(toolName);
    switch (toolName) {
      case 'Read':
        return pendingRead.promise;
      case 'Write':
        return pendingWrite.promise;
      default:
        return { output: 'ok', isError: false };
    }
  });

  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
      { type: 'tool_use', id: 'write-1', name: 'Write', input: { file_path: 'output/a.txt', content: 'x' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  const pendingRun = executeMovement(makeMovement(['Read', 'Write']), 'task', client as never, makeContext());

  // Only Read is running while its gate is closed.
  await waitFor(() => startOrder.length === 1);
  expect(startOrder).toEqual(['Read']);

  // Releasing Read lets Write begin.
  pendingRead.resolve({ output: 'read ok', isError: false });
  await waitFor(() => startOrder.length === 2);
  expect(startOrder).toEqual(['Read', 'Write']);

  pendingWrite.resolve({ output: 'write ok', isError: false });

  const outcome = await pendingRun;
  expect(outcome.next).toBe('COMPLETE');
  expect(executeToolMock).toHaveBeenCalledTimes(2);
});
|
||
|
||
it('executes regular tools before transition even if transition appears mid-batch', async () => {
  // The `complete` call sits between Read and Glob in the same response;
  // both regular tools are still expected to run.
  type ToolOutcome = { output: string; isError: boolean };
  const pendingRead = deferred<ToolOutcome>();
  const pendingGlob = deferred<ToolOutcome>();
  const startOrder: string[] = [];

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Glob']));
  executeToolMock.mockImplementation(async (toolName: string) => {
    startOrder.push(toolName);
    switch (toolName) {
      case 'Read':
        return pendingRead.promise;
      case 'Glob':
        return pendingGlob.promise;
      default:
        return { output: 'glob ok', isError: false };
    }
  });

  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'tool_use', id: 'glob-1', name: 'Glob', input: { pattern: 'input/*' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  const pendingRun = executeMovement(makeMovement(['Read', 'Glob']), 'task', client as never, makeContext());

  await waitFor(() => startOrder.length === 2);
  expect(startOrder).toEqual(['Read', 'Glob']);

  pendingRead.resolve({ output: 'read ok', isError: false });
  pendingGlob.resolve({ output: 'glob ok', isError: false });

  const outcome = await pendingRun;
  expect(outcome.next).toBe('COMPLETE');
  expect(executeToolMock).toHaveBeenCalledTimes(2);
});
|
||
|
||
it('emits text, preview, and tool callbacks in order', async () => {
  // Collectors for each callback channel.
  const seenText: string[] = [];
  const seenPreviews: Array<{ movement: string; preview: string }> = [];
  const seenToolUses: Array<{ name: string; input: Record<string, unknown> }> = [];

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });

  const script: LLMEvent[][] = [
    [
      { type: 'text', text: 'first ' },
      { type: 'text', text: 'second' },
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
    callbacks: {
      onText: (chunk) => seenText.push(chunk),
      onTextPreview: (movementName, preview) => seenPreviews.push({ movement: movementName, preview }),
      onToolUse: (toolName, input) => seenToolUses.push({ name: toolName, input }),
    },
  });

  expect(outcome.next).toBe('COMPLETE');
  // Text arrives chunk by chunk; the preview carries the concatenated text.
  expect(seenText).toEqual(['first ', 'second']);
  expect(seenPreviews).toEqual([{ movement: 'execute', preview: 'first second' }]);
  expect(seenToolUses).toContainEqual({ name: 'Read', input: { file_path: 'input/a.txt' } });
});
|
||
|
||
it('fires onToolResult callback after tool execution', async () => {
  const seenResults: Array<{ name: string; isError: boolean }> = [];

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });

  // One Read turn followed by the completing transition.
  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
    callbacks: {
      onToolResult: (toolName, info) => seenResults.push({ name: toolName, isError: info.isError }),
    },
  });

  expect(seenResults).toContainEqual({ name: 'Read', isError: false });
});
|
||
|
||
it('fires onMemoryCheckpoint callback after regular tool use', async () => {
  const seenCounts: number[] = [];

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });

  // One Read turn followed by the completing transition.
  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
    callbacks: {
      onMemoryCheckpoint: (count) => seenCounts.push(count),
    },
  });

  // Exactly one checkpoint, reporting a count of 1.
  expect(seenCounts).toEqual([1]);
});
|
||
|
||
it('triggers force_transition when context manager signals exhaustion', async () => {
  const { ContextManager } = await import('./context-manager.js');
  // 0.005 * 10_000 = 50 tokens; the scripted usage below reports 80 prompt
  // tokens, which is presumably what trips the threshold.
  const contextManager = new ContextManager({
    thresholds: [{ ratio: 0.005, action: 'force_transition' }],
    limitTokens: 10_000,
  });

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

  // The script never calls `complete`; the transition has to be forced.
  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
      { type: 'done', usage: { prompt_tokens: 80, completion_tokens: 10 } },
    ],
  ];
  const client = new FakeClient(script);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
    contextManager,
  });

  expect(outcome.next).toBe('COMPLETE');
  expect(outcome.output).toContain('Context limit reached');
});
|
||
|
||
it('fires onContextAction callback when context threshold crossed', async () => {
  const { ContextManager } = await import('./context-manager.js');
  // 0.005 * 10_000 = 50 tokens; the first turn reports 60 prompt tokens.
  const contextManager = new ContextManager({
    thresholds: [{ ratio: 0.005, action: 'warn' }],
    limitTokens: 10_000,
  });
  const seenActions: Array<{ type: string }> = [];

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
      { type: 'done', usage: { prompt_tokens: 60, completion_tokens: 10 } },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
    callbacks: {
      onContextAction: (action) => seenActions.push({ type: action.type }),
    },
    contextManager,
  });

  expect(seenActions).toContainEqual({ type: 'warn' });
});
|
||
|
||
it('fires onContextUpdate callback after each LLM usage update', async () => {
  const { ContextManager } = await import('./context-manager.js');
  const contextManager = new ContextManager({ limitTokens: 100_000 });
  const seenUpdates: Array<{ promptTokens: number; limitTokens: number }> = [];

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

  // Only the first turn reports usage (1234 prompt tokens).
  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
      { type: 'done', usage: { prompt_tokens: 1234, completion_tokens: 10 } },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
    callbacks: {
      onContextUpdate: (payload) => seenUpdates.push(payload),
    },
    contextManager,
  });

  expect(seenUpdates.length).toBeGreaterThanOrEqual(1);
  // A zero-token update and the 1234-token update are both expected.
  expect(seenUpdates).toContainEqual({ promptTokens: 0, limitTokens: 100_000 });
  expect(seenUpdates).toContainEqual({ promptTokens: 1234, limitTokens: 100_000 });
});
|
||
|
||
it('compacts oversized tool results before the next LLM request', async () => {
  type ChatMessage = { role: string; content?: string };

  const { ContextManager } = await import('./context-manager.js');
  const contextManager = new ContextManager({ limitTokens: 20_000 });
  // 80k characters of inline base64 returned by a single Read.
  const base64Payload = 'A'.repeat(80_000);
  const hugeBase64 = `<html><img src="data:text/html;base64,${base64Payload}"></html>`;

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: hugeBase64, isError: false });

  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/page.html' } },
      { type: 'done', usage: { prompt_tokens: 1000, completion_tokens: 10 } },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done', usage: { prompt_tokens: 2000, completion_tokens: 10 } },
    ],
  ];
  const client = new FakeClient(script);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
    contextManager,
  });

  expect(outcome.next).toBe('COMPLETE');
  expect(client.calls).toHaveLength(2);

  // The second request must carry a placeholder instead of the raw payload.
  const secondRequest = client.calls[1]!.messages as ChatMessage[];
  const toolMessage = secondRequest.find((message) => message.role === 'tool');
  expect(toolMessage?.content).toContain('[Tool result omitted before LLM request]');
  expect(toolMessage?.content).not.toContain(hugeBase64);

  // The agent should also receive a user-role instruction to retry with a
  // narrower scope, instead of the content being dropped silently.
  const feedbackMessage = secondRequest.find((message) => {
    return message.role === 'user' && message.content?.includes('LLM コンテキストに入れられませんでした');
  });
  expect(feedbackMessage).toBeDefined();
  expect(feedbackMessage?.content).toMatch(/Read\(offset\/limit\)|Grep/);
});
|
||
|
||
it('dedupes duplicate file reads before compacting tool results', async () => {
  type ChatMessage = { role: string; content?: string };

  const { ContextManager } = await import('./context-manager.js');
  // 80% guard => 32_000 tokens. Each large ASCII read is roughly 20k tokens,
  // so two duplicate reads exceed the guard without dedup; after dedup only
  // the latest read survives and the request fits.
  const contextManager = new ContextManager({ limitTokens: 40_000 });

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  const largeContent = 'X'.repeat(72_000);
  executeToolMock.mockResolvedValue({ output: largeContent, isError: false });

  // Two reads of the same path, then the completing transition.
  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: '/dup.ts' } },
      { type: 'done', usage: { prompt_tokens: 500, completion_tokens: 10 } },
    ],
    [
      { type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: '/dup.ts' } },
      { type: 'done', usage: { prompt_tokens: 18_000, completion_tokens: 10 } },
    ],
    [
      { type: 'tool_use', id: 't', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
    contextManager,
  });

  expect(outcome.next).toBe('COMPLETE');
  expect(client.calls.length).toBe(3);

  const thirdRequest = client.calls[2]!.messages as ChatMessage[];
  const toolTexts = (match: (content: string) => boolean): ChatMessage[] =>
    thirdRequest.filter((m) => m.role === 'tool' && typeof m.content === 'string' && match(m.content));

  // The older Read is replaced with a placeholder in the third request...
  const placeholders = toolTexts((content) => content.includes('Duplicate Read of /dup.ts'));
  expect(placeholders.length).toBe(1);

  // ...while the latest Read still carries the full content.
  const fullReads = toolTexts((content) => content === largeContent);
  expect(fullReads.length).toBe(1);
});
|
||
|
||
it('summarizes old turns when dedup+compact alone cannot fit prompt', async () => {
  type ChatMessage = { role: string; content?: string };

  const { ContextManager } = await import('./context-manager.js');
  const { SUMMARY_MARKER_PREFIX } = await import('./context/history-compactor.js');

  // Tight context with distinct Bash outputs (so dedup cannot help), each
  // below LARGE_TOOL_RESULT_TOKENS (so compaction cannot help either): only
  // history summarization can keep the conversation going.
  // Phase 6c expanded the system prompt by ~200 chars (memory_update guidance
  // + new tool definition); the limit was bumped a bit so summarization still
  // fires on the 5th Bash turn rather than the 4th, matching the script
  // layout below.
  const contextManager = new ContextManager({ limitTokens: 35_000 });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Bash']));
  executeToolMock.mockResolvedValue({ output: 'B'.repeat(18_000), isError: false });

  // One scripted Bash turn: `echo <n>` with the given reported prompt size.
  const bashTurn = (n: number, promptTokens: number): LLMEvent[] => [
    { type: 'tool_use', id: `b${n}`, name: 'Bash', input: { command: `echo ${n}` } },
    { type: 'done', usage: { prompt_tokens: promptTokens, completion_tokens: 5 } },
  ];

  // FakeClient.chat() walks a single response list, so the isolated
  // summarization LLM call (runIsolatedLlm) consumes the next entry: after the
  // five Bash turns comes the summary response, then the post-summary
  // transition.
  const script: LLMEvent[][] = [
    bashTurn(1, 1_000),
    bashTurn(2, 5_500),
    bashTurn(3, 10_000),
    bashTurn(4, 15_000),
    bashTurn(5, 20_000),
    [{ type: 'text', text: '## ゴール\nrun bash commands\n## 進捗\nDone: 5 commands' }, { type: 'done' }],
    [
      { type: 'tool_use', id: 't', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  const outcome = await executeMovement(makeMovement(['Bash']), 'task', client as never, makeContext(), {
    contextManager,
  });

  expect(outcome.next).toBe('COMPLETE');

  const lastRequest = client.calls[client.calls.length - 1]!.messages as ChatMessage[];
  const summaryMarkers = lastRequest.filter((m) => {
    return m.role === 'user' && typeof m.content === 'string' && m.content.startsWith(SUMMARY_MARKER_PREFIX);
  });
  // The exact iteration where summarization fires shifted with Phase 6c; the
  // invariant checked here is "summarization fired at least once before
  // completion".
  expect(summaryMarkers.length).toBeGreaterThanOrEqual(1);
});
|
||
|
||
it('aborts when initial prompt is oversized and defaultNext is terminal', async () => {
  const { ContextManager } = await import('./context-manager.js');
  const contextManager = new ContextManager({ limitTokens: 1_000 });

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  // The guard fails before the main LLM is called, but the last-resort
  // summary path uses runIsolatedLlm (also served by FakeClient), so one
  // response is scripted for that summary call.
  const script: LLMEvent[][] = [[{ type: 'text', text: '### Status\nstuck on overflow' }, { type: 'done' }]];
  const client = new FakeClient(script);

  // A 10k-character task against a 1k-token limit.
  const oversizedTask = 'x'.repeat(10_000);
  const outcome = await executeMovement(makeMovement(['Read']), oversizedTask, client as never, makeContext(), {
    contextManager,
  });

  // A terminal fallback would report a false completion, so overflow aborts.
  expect(outcome.next).toBe('ABORT');
  expect(outcome.abortCode).toBe('context_overflow');
  expect(outcome.output).toContain('LLM request blocked before send');
  // No main LLM turn ran, so no tool was ever executed.
  expect(executeToolMock).not.toHaveBeenCalled();
});
|
||
|
||
it('falls back to ABORT when oversized prompt has no defaultNext to force-transition to', async () => {
  const { ContextManager } = await import('./context-manager.js');
  const contextManager = new ContextManager({ limitTokens: 1_000 });

  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  const script: LLMEvent[][] = [[{ type: 'text', text: 'should not be called' }, { type: 'done' }]];
  const client = new FakeClient(script);

  // Same shape as makeMovement(['Read']) but deliberately without defaultNext.
  const movementWithoutDefault: Movement = {
    name: 'execute',
    edit: true,
    persona: 'worker',
    instruction: 'Do the work.',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
  };

  const oversizedTask = 'x'.repeat(10_000);
  const outcome = await executeMovement(movementWithoutDefault, oversizedTask, client as never, makeContext(), {
    contextManager,
  });

  expect(outcome.next).toBe('ABORT');
  expect(outcome.output).toContain('LLM request blocked before send');
  expect(executeToolMock).not.toHaveBeenCalled();
});
|
||
|
||
it('returns ABORT immediately when cancelSignal is already aborted', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  // A Read turn is scripted, but it must never be acted upon.
  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  // Abort before the movement even starts.
  const abortController = new AbortController();
  abortController.abort();

  const outcome = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
    cancelSignal: abortController.signal,
  });

  expect(outcome.next).toBe('ABORT');
  expect(outcome.output).toContain('cancelled');
  expect(executeToolMock).not.toHaveBeenCalled();
});
|
||
|
||
it('aborts after text-only responses without transition calls', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  // Builds a response that contains text only: no tool call, no transition.
  const textOnly = (text: string): LLMEvent[] => [{ type: 'text', text }, { type: 'done' }];

  // Four text-only responses are scripted. MAX_TEXT_ONLY_RETRIES is 3, so the
  // movement should ABORT after the third and never consume the fourth.
  const script: LLMEvent[][] = [
    textOnly('I will do the work'),
    textOnly('Still working on it'),
    textOnly('Almost done'),
    textOnly('Never reached'),
  ];
  const client = new FakeClient(script);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());

  expect(outcome.next).toBe('ABORT');
  expect(executeToolMock).not.toHaveBeenCalled();
});
|
||
|
||
it('resets text-only counter when tool calls happen in between', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

  const textOnly = (text: string): LLMEvent[] => [{ type: 'text', text }, { type: 'done' }];
  const readTurn = (id: string, filePath: string): LLMEvent[] => [
    { type: 'tool_use', id, name: 'Read', input: { file_path: filePath } },
    { type: 'done' },
  ];

  // Script: text-only -> Read -> text-only -> Read -> text-only -> complete.
  // Three text-only responses in total would trigger an abort if the counter
  // were cumulative; because each tool call resets it, the count never
  // reaches three in a row.
  const script: LLMEvent[][] = [
    textOnly('Thinking about the answer...'),
    readTurn('read-1', 'a.txt'),
    textOnly('Found some info, analyzing...'),
    readTurn('read-2', 'b.txt'),
    textOnly('Almost done with analysis'),
    [
      { type: 'tool_use', id: 'complete-1', name: 'complete', input: { status: 'success', result: 'Done' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());

  expect(outcome.next).not.toBe('ABORT');
  expect(executeToolMock).toHaveBeenCalled();
});
|
||
|
||
it('aborts when maxIterations is exceeded', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

  // The LLM keeps using tools but never calls a transition. Tool-call ids are
  // derived from the turn index (previously Math.random()) so the fixture is
  // deterministic and ids are guaranteed unique across turns.
  const toolResponses: LLMEvent[][] = Array.from({ length: 4 }, (_, turn): LLMEvent[] => [
    { type: 'tool_use', id: `read-${turn + 1}`, name: 'Read', input: { file_path: 'a.txt' } },
    { type: 'done' },
  ]);

  const client = new FakeClient(toolResponses);

  // More tool turns are scripted (4) than iterations are allowed (2).
  const result = await executeMovement(
    makeMovement(['Read']),
    'task',
    client as never,
    makeContext(),
    { maxIterations: 2 },
  );

  expect(result.next).toBe('ABORT');
  expect(result.output).toContain('iteration');
});
|
||
|
||
});
|
||
|
||
describe('executeMovement cross-movement Read cache', () => {
|
||
afterEach(() => {
  // Reset both shared mocks so no test sees another test's calls or stubs.
  for (const mock of [executeToolMock, getToolDefsMock]) {
    mock.mockReset();
  }
});
|
||
|
||
it('returns a cached Read result when a later movement reads the same file', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  const fileContent = 'const answer = 42;\n';
  // Only a Read of foo.ts succeeds; anything else is flagged as an error.
  executeToolMock.mockImplementation(async (toolName: string, input: Record<string, unknown>) => {
    const isExpectedRead = toolName === 'Read' && input['file_path'] === 'foo.ts';
    return isExpectedRead ? { output: fileContent, isError: false } : { output: 'unexpected', isError: true };
  });

  // One cache instance is shared by both movements.
  const sharedCache = new ToolResultCache();

  // --- Movement A: reads foo.ts for real and hands over to "plan". ---
  const investigate: Movement = {
    name: 'investigate',
    edit: false,
    persona: 'investigator',
    instruction: 'Look at foo.ts.',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'plan' }],
    defaultNext: 'plan',
  };
  const investigateScript: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'foo.ts' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'transition', input: { next_step: 'plan', summary: 'foo.ts read' } },
      { type: 'done' },
    ],
  ];
  const investigateClient = new FakeClient(investigateScript);
  const investigateOutcome = await executeMovement(investigate, 'task', investigateClient as never, makeContext(), {
    toolResultCache: sharedCache,
  });
  expect(investigateOutcome.next).toBe('plan');
  expect(executeToolMock).toHaveBeenCalledTimes(1);
  expect(sharedCache.size()).toBe(1);

  // --- Movement B: reads the same file again with the same cache. ---
  const seenToolResults: string[] = [];
  const plan: Movement = {
    name: 'plan',
    edit: false,
    persona: 'planner',
    instruction: 'Plan based on foo.ts.',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  const planScript: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-2', name: 'Read', input: { file_path: 'foo.ts' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-2', name: 'complete', input: { status: 'success', result: 'used cached read' } },
      { type: 'done' },
    ],
  ];
  const planClient = new FakeClient(planScript);
  const planOutcome = await executeMovement(plan, 'task', planClient as never, makeContext(), {
    toolResultCache: sharedCache,
    callbacks: {
      onToolResult: (_name, info) => seenToolResults.push(info.result),
    },
  });

  expect(planOutcome.next).toBe('COMPLETE');
  // executeTool was NOT invoked again: the cache short-circuited the second Read.
  expect(executeToolMock).toHaveBeenCalledTimes(1);
  // The reported result is tagged as cached and still ends with the raw content.
  const cachedTag = '[cached: Read foo.ts from movement investigate at ';
  expect(seenToolResults.some((r) => r.startsWith(cachedTag))).toBe(true);
  expect(seenToolResults.some((r) => r.endsWith(fileContent))).toBe(true);
});
|
||
|
||
it('does not cache error results', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
  // Every tool call fails with a "file not found" style error.
  executeToolMock.mockResolvedValue({ output: 'ENOENT: no such file', isError: true });

  const resultCache = new ToolResultCache();
  const investigate: Movement = {
    name: 'investigate',
    edit: false,
    persona: 'investigator',
    instruction: 'Look at missing.ts.',
    allowedTools: ['Read'],
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  const script: LLMEvent[][] = [
    [
      { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'missing.ts' } },
      { type: 'done' },
    ],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'tried' } },
      { type: 'done' },
    ],
  ];
  const client = new FakeClient(script);

  await executeMovement(investigate, 'task', client as never, makeContext(), { toolResultCache: resultCache });

  // The failed Read must leave the cache empty.
  expect(resultCache.size()).toBe(0);
});
|
||
|
||
it('skips caching tools outside the cacheable allowlist (Bash etc.)', async () => {
  // Bash succeeds here, yet its result is still expected to stay out of the cache.
  executeToolMock.mockResolvedValue({ output: 'shell ok', isError: false });
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Bash']));

  const shellStep: Movement = {
    name: 'investigate',
    persona: 'investigator',
    instruction: 'Run.',
    edit: false,
    allowedTools: ['Bash'],
    defaultNext: 'COMPLETE',
    rules: [{ condition: 'done', next: 'COMPLETE' }],
  };
  // Turn 1 runs the shell command; turn 2 finishes the movement.
  const scriptedLlm = new FakeClient([
    [{ type: 'tool_use', id: 'b-1', name: 'Bash', input: { command: 'echo hi' } }, { type: 'done' }],
    [
      { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'ran' } },
      { type: 'done' },
    ],
  ]);
  const resultCache = new ToolResultCache();

  await executeMovement(shellStep, 'task', scriptedLlm as never, makeContext(), {
    toolResultCache: resultCache,
  });

  expect(resultCache.size()).toBe(0);
});
|
||
});
|
||
|
||
describe('executeMovement Phase 2 cache invalidation', () => {
  /**
   * Builds a movement with the placeholder persona/instruction used by every
   * scenario in this suite. `next` is used both as the target of the single
   * 'done' rule and as the default transition.
   */
  const stage = (name: string, edit: boolean, allowedTools: string[], next: string): Movement => ({
    name,
    edit,
    persona: 'p',
    instruction: 'i',
    allowedTools,
    rules: [{ condition: 'done', next }],
    defaultNext: next,
  });

  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('invalidates cached Read after a successful Edit on the same path', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));

    // Fake foo.ts: Read reports the current version, a successful Edit bumps it.
    let version = 1;
    const reads: number[] = [];
    executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
      if (name === 'Read' && input['file_path'] === 'foo.ts') {
        reads.push(version);
        return { output: `version=${version}`, isError: false };
      }
      if (name === 'Edit' && input['file_path'] === 'foo.ts') {
        version += 1;
        return { output: 'edited', isError: false };
      }
      return { output: 'unexpected', isError: true };
    });

    const sharedCache = new ToolResultCache();

    // Movement 1: Read foo.ts, which populates the cache.
    const readerLlm = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 'read' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('investigate', true, ['Read'], 'execute'), 'task', readerLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(sharedCache.size()).toBe(1);

    // Movement 2: Edit foo.ts, which must evict the cached Read entry.
    const editorLlm = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'transition', input: { next_step: 'verify', summary: 'edited' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('execute', true, ['Edit'], 'verify'), 'task', editorLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(sharedCache.size()).toBe(0);

    // Movement 3: Read foo.ts again while recording what the tool callback reports.
    const seenResults: string[] = [];
    const verifierLlm = new FakeClient([
      [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't3', name: 'complete', input: { status: 'success', result: 're-read' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('verify', false, ['Read'], 'COMPLETE'), 'task', verifierLlm as never, makeContext(), {
      toolResultCache: sharedCache,
      callbacks: { onToolResult: (_n, info) => seenResults.push(info.result) },
    });

    // The second Read reached the tool (no cache hit) and saw the post-edit version.
    expect(reads).toEqual([1, 2]);
    expect(seenResults.includes('version=2')).toBe(true);
    expect(seenResults.some((text) => text.startsWith('[cached:'))).toBe(false);
  });

  it('invalidates only the affected path; other cached files survive', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));

    // Record every real tool invocation so per-file Read counts can be checked.
    const callLog: Array<{ name: string; path?: string }> = [];
    executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
      callLog.push({ name, path: input['file_path'] as string | undefined });
      switch (name) {
        case 'Read':
          return { output: `${input['file_path']} body`, isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedCache = new ToolResultCache();

    // Movement 1 caches both foo.ts and bar.ts.
    const readerLlm = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'bar.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('investigate', true, ['Read'], 'execute'), 'task', readerLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(sharedCache.size()).toBe(2);

    // Movement 2 edits foo.ts only, then re-reads both files in a single turn.
    const seenResults: string[] = [];
    const editorLlm = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [
        { type: 'tool_use', id: 'r3', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'tool_use', id: 'r4', name: 'Read', input: { file_path: 'bar.ts' } },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('execute', true, ['Edit', 'Read'], 'COMPLETE'), 'task', editorLlm as never, makeContext(), {
      toolResultCache: sharedCache,
      callbacks: { onToolResult: (_n, info) => seenResults.push(info.result) },
    });

    const realReadsOf = (path: string): number =>
      callLog.filter((entry) => entry.name === 'Read' && entry.path === path).length;

    expect(realReadsOf('foo.ts')).toBe(2); // re-read after Edit
    expect(realReadsOf('bar.ts')).toBe(1); // bar still cached
    expect(seenResults.some((text) => text.startsWith('[cached: Read bar.ts'))).toBe(true);
    expect(seenResults.some((text) => text.startsWith('[cached: Read foo.ts'))).toBe(false);
  });

  it('Bash invalidates every file-derived entry conservatively', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Bash']));

    let readCalls = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'Read':
          readCalls += 1;
          return { output: `read ${readCalls}`, isError: false };
        case 'Bash':
          return { output: 'shell ok', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedCache = new ToolResultCache();

    // Movement 1 caches two Reads.
    const readerLlm = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'bar.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('investigate', false, ['Read'], 'execute'), 'task', readerLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(sharedCache.size()).toBe(2);

    // Movement 2 runs a single successful Bash command.
    const shellLlm = new FakeClient([
      [{ type: 'tool_use', id: 'b1', name: 'Bash', input: { command: 'echo hi' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('execute', false, ['Bash'], 'COMPLETE'), 'task', shellLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(sharedCache.size()).toBe(0); // every Read evicted
  });

  it('does NOT invalidate when Edit/Write/Bash returned an error', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));

    // Read succeeds; every Edit attempt fails with an error result.
    let editAttempts = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'Read':
          return { output: 'body', isError: false };
        case 'Edit':
          editAttempts += 1;
          return { output: 'permission denied', isError: true };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedCache = new ToolResultCache();

    const readerLlm = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('investigate', true, ['Read'], 'execute'), 'task', readerLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(sharedCache.size()).toBe(1);

    const editorLlm = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('execute', true, ['Edit'], 'COMPLETE'), 'task', editorLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    expect(editAttempts).toBe(1);
    expect(sharedCache.size()).toBe(1); // failed Edit kept the cache intact
  });
});
|
||
|
||
describe('executeMovement Phase 3 WorkspaceMemory', () => {
  /**
   * Builds a movement with the placeholder persona/instruction ('p' / 'i').
   * `next` is used both as the target of the single 'done' rule and as the
   * default transition.
   */
  const stage = (name: string, edit: boolean, allowedTools: string[], next: string): Movement => ({
    name,
    edit,
    persona: 'p',
    instruction: 'i',
    allowedTools,
    rules: [{ condition: 'done', next }],
    defaultNext: next,
  });

  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('persists memory_update from one movement and exposes it in the next system prompt', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });

    const sharedMemory = new WorkspaceMemory();

    // Movement 1 reads a file, then transitions while submitting a memory_update payload.
    const investigate: Movement = {
      ...stage('investigate', false, ['Read'], 'plan'),
      persona: 'investigator',
      instruction: 'Find the bug.',
    };
    const investigatorLlm = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'plan',
            summary: 'investigated',
            memory_update: {
              facts: [{ claim: 'foo.ts uses bar()', evidence_paths: ['foo.ts'], confidence: 'high' }],
              decisions: [{ text: 'patch foo.ts directly', evidence_paths: ['foo.ts'] }],
              do_not_repeat: ['re-read foo.ts unless evidence breaks'],
            },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(investigate, 'task', investigatorLlm as never, makeContext(), {
      workspaceMemory: sharedMemory,
    });

    // One entry of each kind was recorded.
    expect(sharedMemory.size().facts).toBe(1);
    expect(sharedMemory.size().decisions).toBe(1);
    expect(sharedMemory.size().doNotRepeat).toBe(1);

    // Movement 2 completes immediately; only its first request is inspected.
    const plan: Movement = {
      ...stage('plan', false, ['Read'], 'COMPLETE'),
      persona: 'planner',
      instruction: 'Plan the fix.',
    };
    const plannerLlm = new FakeClient([
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 'planned' } }, { type: 'done' }],
    ]);
    await executeMovement(plan, 'task', plannerLlm as never, makeContext(), { workspaceMemory: sharedMemory });

    const firstRequest = plannerLlm.calls[0]?.messages as Array<{ role: string; content: string }>;
    const systemPrompt = firstRequest[0]!;
    expect(systemPrompt.role).toBe('system');
    // Section heading, the three stored entries, and the trailing memory caveat.
    const expectedFragments = [
      '## これまでに蓄積した観測',
      'foo.ts uses bar()',
      'patch foo.ts directly',
      're-read foo.ts unless evidence breaks',
      'memory は再調査禁止の根拠ではなく',
    ];
    for (const fragment of expectedFragments) {
      expect(systemPrompt.content).toContain(fragment);
    }
  });

  it('omits memory section when memory is empty', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

    const emptyMemory = new WorkspaceMemory();
    const scriptedLlm = new FakeClient([
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('investigate', false, ['Read'], 'COMPLETE'), 'task', scriptedLlm as never, makeContext(), {
      workspaceMemory: emptyMemory,
    });

    const firstRequest = scriptedLlm.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(firstRequest[0]!.content).not.toContain('## これまでに蓄積した観測');
  });

  it('Edit invalidates a fact whose evidence_paths includes the edited file', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'Read':
          return { output: 'body', isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedMemory = new WorkspaceMemory();

    // Movement 1 records two facts: one backed by foo.ts and one backed by bar.ts.
    const investigatorLlm = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'execute',
            summary: 'investigated',
            memory_update: {
              facts: [
                { claim: 'foo.ts uses bar()', evidence_paths: ['foo.ts'] },
                { claim: 'unrelated truth', evidence_paths: ['bar.ts'] },
              ],
            },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('investigate', true, ['Read'], 'execute'), 'task', investigatorLlm as never, makeContext(), {
      workspaceMemory: sharedMemory,
    });
    expect(sharedMemory.snapshot().facts).toHaveLength(2);

    // Movement 2 edits foo.ts; only the bar.ts-backed fact should remain.
    const editorLlm = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 'edited' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('execute', true, ['Edit'], 'COMPLETE'), 'task', editorLlm as never, makeContext(), {
      workspaceMemory: sharedMemory,
    });

    const remaining = sharedMemory.snapshot();
    expect(remaining.facts).toHaveLength(1);
    expect(remaining.facts[0]!.claim).toBe('unrelated truth');
  });

  it('still works when piece submits no memory_update (backward compat)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });

    const sharedMemory = new WorkspaceMemory();
    // The `complete` call carries only status/result and no memory_update field.
    const scriptedLlm = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 'just summary' } }, { type: 'done' }],
    ]);
    const outcome = await executeMovement(
      stage('investigate', false, ['Read'], 'COMPLETE'),
      'task',
      scriptedLlm as never,
      makeContext(),
      { workspaceMemory: sharedMemory },
    );

    expect(outcome.next).toBe('COMPLETE');
    expect(sharedMemory.size().facts).toBe(0);
  });
});
|
||
|
||
describe('executeMovement Phase 4 cache extension', () => {
  /**
   * Builds a movement with the placeholder persona/instruction ('p' / 'i').
   * `next` is used both as the target of the single 'done' rule and as the
   * default transition.
   */
  const stage = (name: string, edit: boolean, allowedTools: string[], next: string): Movement => ({
    name,
    edit,
    persona: 'p',
    instruction: 'i',
    allowedTools,
    rules: [{ condition: 'done', next }],
    defaultNext: next,
  });

  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('caches Grep results and serves them to a later movement', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Grep']));

    // Count how many times Grep actually reaches the tool layer.
    let grepCalls = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name !== 'Grep') return { output: 'x', isError: true };
      grepCalls += 1;
      return { output: 'matches in src/foo.ts:1\n', isError: false };
    });

    const sharedCache = new ToolResultCache();

    // Movement 1 performs the real Grep.
    const firstLlm = new FakeClient([
      [{ type: 'tool_use', id: 'g1', name: 'Grep', input: { pattern: 'foo', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('investigate', false, ['Grep'], 'plan'), 'task', firstLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(grepCalls).toBe(1);

    // Movement 2 repeats the identical Grep input.
    const seenResults: string[] = [];
    const secondLlm = new FakeClient([
      [{ type: 'tool_use', id: 'g2', name: 'Grep', input: { pattern: 'foo', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('plan', false, ['Grep'], 'COMPLETE'), 'task', secondLlm as never, makeContext(), {
      toolResultCache: sharedCache,
      callbacks: { onToolResult: (_n, info) => seenResults.push(info.result) },
    });

    // No second tool invocation; the reported result carries the cached tag.
    expect(grepCalls).toBe(1);
    expect(seenResults.some((text) => text.startsWith('[cached: Grep foo'))).toBe(true);
  });

  it('Edit invalidates ALL search entries (Grep) regardless of scope', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Grep', 'Edit']));

    let grepCalls = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'Grep':
          grepCalls += 1;
          return { output: `match v${grepCalls}`, isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedCache = new ToolResultCache();

    const searchLlm = new FakeClient([
      [{ type: 'tool_use', id: 'g1', name: 'Grep', input: { pattern: 'x', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('investigate', true, ['Grep'], 'execute'), 'task', searchLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    // The edited file (unrelated.ts) is not under the searched path (src/).
    const editThenSearchLlm = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'unrelated.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'g2', name: 'Grep', input: { pattern: 'x', path: 'src/' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('execute', true, ['Edit', 'Grep'], 'COMPLETE'), 'task', editThenSearchLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    // The repeated Grep had to run for real.
    expect(grepCalls).toBe(2);
  });

  it('caches WebFetch and survives Edit invalidation', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['WebFetch', 'Edit']));

    let fetchCalls = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'WebFetch':
          fetchCalls += 1;
          return { output: 'page body', isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedCache = new ToolResultCache();

    const fetchLlm = new FakeClient([
      [{ type: 'tool_use', id: 'w1', name: 'WebFetch', input: { url: 'https://example.com/x' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('investigate', true, ['WebFetch'], 'execute'), 'task', fetchLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    // A local Edit happens between the two identical WebFetch calls.
    const editThenFetchLlm = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'w2', name: 'WebFetch', input: { url: 'https://example.com/x' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(stage('execute', true, ['Edit', 'WebFetch'], 'COMPLETE'), 'task', editThenFetchLlm as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    // The second WebFetch never reached the tool layer.
    expect(fetchCalls).toBe(1);
  });

  it('caches Office tool results (ReadPdf) keyed by path + range', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['ReadPdf']));

    let pdfCalls = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name !== 'ReadPdf') return { output: 'x', isError: true };
      pdfCalls += 1;
      return { output: 'pdf body', isError: false };
    });

    const sharedCache = new ToolResultCache();
    // The same movement definition is executed twice with different scripted clients.
    const pdfStage = stage('investigate', false, ['ReadPdf'], 'COMPLETE');

    const firstLlm = new FakeClient([
      [{ type: 'tool_use', id: 'p1', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '1-3' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(pdfStage, 'task', firstLlm as never, makeContext(), { toolResultCache: sharedCache });
    expect(pdfCalls).toBe(1);

    // Second run: pages 1-3 repeat the earlier input, pages 4-6 are a new range.
    const secondLlm = new FakeClient([
      [{ type: 'tool_use', id: 'p2', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '1-3' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'p3', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '4-6' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(pdfStage, 'task', secondLlm as never, makeContext(), { toolResultCache: sharedCache });
    // Exactly one additional real call across the two ReadPdf requests.
    expect(pdfCalls).toBe(2);
  });
});
|
||
|
||
describe('buildSystemPrompt checklist guidance', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('emits a "進捗管理" section with strengthened CreateChecklist guidance', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

    const singleStep: Movement = {
      name: 'investigate',
      persona: 'p',
      instruction: 'i',
      edit: false,
      allowedTools: ['Read'],
      defaultNext: 'COMPLETE',
      rules: [{ condition: 'done', next: 'COMPLETE' }],
    };
    // The model completes on its first turn; only the outgoing system prompt matters here.
    const scriptedLlm = new FakeClient([
      [{ type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(singleStep, 'task', scriptedLlm as never, makeContext());

    const firstRequest = scriptedLlm.calls[0]?.messages as Array<{ role: string; content: string }>;
    const systemPrompt = firstRequest[0]!;
    expect(systemPrompt.role).toBe('system');

    const expectedFragments = [
      // Section heading, the tool name, and the "always available" note.
      '## 進捗管理',
      'CreateChecklist',
      '全 piece で常時利用可能',
      // Stronger language (option A): an explicit "3 or more tool calls"
      // criterion, "when in doubt, create one", and an explicit note about
      // the second and later exchanges.
      '3 個以上のツール呼び出し',
      '判断に迷ったら作る',
      '2 回目以降のやり取り',
    ];
    for (const fragment of expectedFragments) {
      expect(systemPrompt.content).toContain(fragment);
    }
  });
});
|
||
|
||
describe('checklist watchdog (option B)', () => {
  /** Scripts `count` consecutive turns that each Read a distinct file (f0.ts, f1.ts, ...). */
  const readTurns = (count: number) =>
    Array.from({ length: count }, (_, i) => [
      { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
      { type: 'done' },
    ]);

  /** Scripts the final turn that ends the movement successfully. */
  const completeTurn = () => [
    { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
    { type: 'done' },
  ];

  /** Counts user-role string messages that carry the watchdog tag. */
  const countWatchdogReminders = (messages: Array<{ role: string; content?: unknown }>): number =>
    messages.filter(
      (m) => m.role === 'user' && typeof m.content === 'string' && m.content.includes('[checklist watchdog]'),
    ).length;

  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('injects a one-shot reminder after 5 iterations without any checklist tool', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // 6 iterations of plain Read, then 1 iteration with complete. The watchdog
    // should fire on iteration 5 (0-indexed).
    const script = [...readTurns(6), completeTurn()];
    const scriptedLlm = new FakeClient(script as never);

    await executeMovement(makeMovement(['Read']), 'task', scriptedLlm as never, makeContext());

    // The last request holds the whole conversation, so search it for the tag.
    const finalMessages = scriptedLlm.calls[scriptedLlm.calls.length - 1]!.messages as Array<{
      role: string;
      content?: unknown;
    }>;
    expect(countWatchdogReminders(finalMessages)).toBe(1);
  });

  it('does NOT fire when CreateChecklist is called early', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'CreateChecklist']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // CreateChecklist on iteration 0, then several Read, then complete.
    const script = [
      [{ type: 'tool_use', id: 'cc', name: 'CreateChecklist', input: { name: 'plan', items: [] } }, { type: 'done' }],
      ...readTurns(7),
      completeTurn(),
    ];
    const scriptedLlm = new FakeClient(script as never);

    await executeMovement(makeMovement(['Read', 'CreateChecklist']), 'task', scriptedLlm as never, makeContext());

    const finalMessages = scriptedLlm.calls[scriptedLlm.calls.length - 1]!.messages as Array<{
      role: string;
      content?: unknown;
    }>;
    expect(countWatchdogReminders(finalMessages)).toBe(0);
  });
});
|
||
|
||
// ============================================================
|
||
// Phase 6a — `complete` tool (§7.1–7.7 of design plan)
|
||
// ============================================================
|
||
|
||
describe('Phase 6a: complete tool — §7.1 native basics', () => {
|
||
afterEach(() => {
  // Reset both hoisted tool mocks after every test in this suite.
  for (const mock of [executeToolMock, getToolDefsMock]) {
    mock.mockReset();
  }
});
|
||
|
||
it('success status with result becomes the movement output (no chatter contamination)', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  // Free text emitted before the `complete` call; it must not end up in the output.
  const chatter = "I'll figure this out first";
  const scriptedLlm = new FakeClient([
    [
      { type: 'text', text: chatter },
      { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'final answer' } },
      { type: 'done' },
    ],
  ]);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', scriptedLlm as never, makeContext());

  expect(outcome.next).toBe('COMPLETE');
  expect(outcome.output).toBe('final answer');
  expect(outcome.output).not.toContain(chatter);
});
|
||
|
||
it('aborted status routes via ABORT next', async () => {
  getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

  // The model aborts on its first turn and supplies a reason.
  const abortCall = {
    type: 'tool_use',
    id: 'c1',
    name: 'complete',
    input: { status: 'aborted', abort_reason: 'tool unavailable' },
  } as const;
  const scriptedLlm = new FakeClient([[abortCall, { type: 'done' }]]);

  const outcome = await executeMovement(makeMovement(['Read']), 'task', scriptedLlm as never, makeContext());

  // The abort reason is surfaced as the movement output.
  expect(outcome.next).toBe('ABORT');
  expect(outcome.output).toBe('tool unavailable');
});
|
||
|
||
it('needs_user_input routes via ASK next', async () => {
|
||
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
|
||
const client = new FakeClient([
|
||
[
|
||
{
|
||
type: 'tool_use', id: 'c1', name: 'complete',
|
||
input: { status: 'needs_user_input', missing_info: 'which dataset?', why_no_default: 'multiple options' },
|
||
},
|
||
{ type: 'done' },
|
||
],
|
||
]);
|
||
const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
|
||
expect(result.next).toBe('ASK');
|
||
expect(result.output).toBe('which dataset?');
|
||
});
|
||
|
||
it('rejects success with empty result and forces retry (no accumulatedText fallback for native)', async () => {
|
||
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
|
||
const client = new FakeClient([
|
||
[
|
||
{ type: 'text', text: 'this would have been the fallback' },
|
||
{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
|
||
{ type: 'done' },
|
||
],
|
||
[
|
||
{ type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'real answer' } },
|
||
{ type: 'done' },
|
||
],
|
||
]);
|
||
const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
|
||
expect(result.next).toBe('COMPLETE');
|
||
expect(result.output).toBe('real answer');
|
||
// The fallback chatter must NOT have leaked into the output via retry.
|
||
expect(result.output).not.toContain('this would have been the fallback');
|
||
});
|
||
|
||
it('rejects aborted without abort_reason and forces retry', async () => {
|
||
getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
|
||
const client = new FakeClient([
|
||
[{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'aborted' } }, { type: 'done' }],
|
||
[{ type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'aborted', abort_reason: 'real reason' } }, { type: 'done' }],
|
||
]);
|
||
const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
|
||
expect(result.next).toBe('ABORT');
|
||
expect(result.output).toBe('real reason');
|
||
});
|
||
});
|
||
|
||
/**
 * Phase 6a §7.2 — precedence rules when a turn carries an invalid `complete`
 * call or more than one `complete` call.
 */
describe('Phase 6a: complete tool — §7.2 precedence rules (§2.5)', () => {
  // Forget any scripted mock behaviour so cases cannot influence each other.
  afterEach(() => {
    for (const mock of [executeToolMock, getToolDefsMock]) {
      mock.mockReset();
    }
  });

  /** Runs a Read-only movement (with `Read` advertised) and returns its output. */
  const outputOf = async (llm: InstanceType<typeof FakeClient>) => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const outcome = await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());
    return outcome.output;
  };

  it('invalid native complete forces retry (no fallback)', async () => {
    const llm = new FakeClient([
      // First iteration: invalid (empty result) — must retry
      [
        { type: 'text', text: 'this would have been the fallback' },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // Second iteration: LLM corrects with valid native
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'native corrected' } },
        { type: 'done' },
      ],
    ]);

    const output = await outputOf(llm);

    expect(output).toBe('native corrected');
    // accumulatedText must NOT have leaked into the output via retry
    expect(output).not.toContain('fallback');
  });

  it('two native completes with conflicting args → retry', async () => {
    const llm = new FakeClient([
      // Same turn, two different answers.
      [
        { type: 'tool_use', id: 'a', name: 'complete', input: { status: 'success', result: 'A' } },
        { type: 'tool_use', id: 'b', name: 'complete', input: { status: 'success', result: 'B' } },
        { type: 'done' },
      ],
      // Follow-up turn with a single answer.
      [
        { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'consistent' } },
        { type: 'done' },
      ],
    ]);

    expect(await outputOf(llm)).toBe('consistent');
  });

  it('two native completes with identical args → first one used', async () => {
    const llm = new FakeClient([
      [
        { type: 'tool_use', id: 'a', name: 'complete', input: { status: 'success', result: 'same answer' } },
        { type: 'tool_use', id: 'b', name: 'complete', input: { status: 'success', result: 'same answer' } },
        { type: 'done' },
      ],
    ]);

    expect(await outputOf(llm)).toBe('same answer');
  });
});
|
||
|
||
/**
 * Phase 6a §7.4 — regression coverage for the `transition` tool alongside
 * the `complete` tool.
 */
describe('Phase 6a: complete tool — §7.4 regressions', () => {
  // Forget any scripted mock behaviour so cases cannot influence each other.
  afterEach(() => {
    for (const mock of [executeToolMock, getToolDefsMock]) {
      mock.mockReset();
    }
  });

  it('non-terminal transition (movement-to-movement) still works', async () => {
    const investigate: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: { next_step: 'plan', summary: 'investigated' },
        },
        { type: 'done' },
      ],
    ]);

    const outcome = await executeMovement(investigate, 'task', llm as never, makeContext());

    expect(outcome.next).toBe('plan');
    expect(outcome.output).toBe('investigated');
  });

  it('transition({next_step:"COMPLETE"}) is rejected (Phase 6b)', async () => {
    const investigate: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      // No COMPLETE in rules — legacy terminal cannot succeed.
      rules: [{ condition: 'fix needed', next: 'fix' }],
    };
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    // FakeClient bypasses tool-schema validation, but the engine's
    // validateTransition still rejects unknown next values, so an LLM
    // emitting a legacy terminal transition gets a tool-result error and
    // must retry. This locks in the no-shim behavior.
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: { next_step: 'COMPLETE', summary: 'should not be accepted' },
        },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'corrected' } },
        { type: 'done' },
      ],
    ]);

    const outcome = await executeMovement(investigate, 'task', llm as never, makeContext());

    expect(outcome.output).toBe('corrected');
  });
});
|
||
|
||
/**
 * Phase 6a §7.5 — handling of the `memory_update` payload carried inside a
 * `complete` call, observed through a WorkspaceMemory passed to the movement.
 */
describe('Phase 6a: complete tool — §7.5 memory_update behavior', () => {
  // Forget any scripted mock behaviour so cases cannot influence each other.
  afterEach(() => {
    for (const mock of [executeToolMock, getToolDefsMock]) {
      mock.mockReset();
    }
  });

  /** Runs a Read-only movement that writes into the supplied memory. */
  const runWithMemory = (llm: InstanceType<typeof FakeClient>, store: WorkspaceMemory) => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    return executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext(), {
      workspaceMemory: store,
    });
  };

  it('memory_update inside complete is applied to WorkspaceMemory (success)', async () => {
    const store = new WorkspaceMemory();
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'success',
            result: 'done',
            memory_update: { facts: [{ claim: 'X is Y', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
    ]);

    await runWithMemory(llm, store);

    expect(store.size().facts).toBe(1);
  });

  it('memory_update is NOT applied when complete is invalid and retried', async () => {
    const store = new WorkspaceMemory();
    const llm = new FakeClient([
      // Invalid first attempt — memory_update must NOT commit
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'success',
            result: '',
            memory_update: { facts: [{ claim: 'should not commit', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
      // Valid retry — different fact, must commit only this one
      [
        {
          type: 'tool_use',
          id: 'c2',
          name: 'complete',
          input: {
            status: 'success',
            result: 'fixed',
            memory_update: { facts: [{ claim: 'committed once', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
    ]);

    await runWithMemory(llm, store);

    expect(store.size().facts).toBe(1);
    expect(store.snapshot().facts[0]!.claim).toBe('committed once');
  });

  it('memory_update applied for status=aborted', async () => {
    const store = new WorkspaceMemory();
    const llm = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'aborted',
            abort_reason: 'tool unavailable',
            memory_update: { facts: [{ claim: 'lesson from failure', confidence: 'medium' }] },
          },
        },
        { type: 'done' },
      ],
    ]);

    await runWithMemory(llm, store);

    expect(store.size().facts).toBe(1);
  });
});
|
||
|
||
/**
 * Phase 6a §7.7 — conversation-history integrity when a turn with several
 * `complete` calls is retried.
 */
describe('Phase 6a: complete tool — §7.7 v3 critical (Conditional Go) tests', () => {
  // Forget any scripted mock behaviour so cases cannot influence each other.
  afterEach(() => {
    for (const mock of [executeToolMock, getToolDefsMock]) {
      mock.mockReset();
    }
  });

  it('all tool_use ids get a tool_result on retry (conversation history integrity)', async () => {
    type SentMessage = { role: string; tool_call_id?: string };

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const llm = new FakeClient([
      // Turn 1: an invalid `complete` plus a second `complete` in the same turn.
      [
        { type: 'tool_use', id: 'fail-1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'tool_use', id: 'ignored-1', name: 'complete', input: { status: 'success', result: 'X' } },
        { type: 'done' },
      ],
      // Turn 2: a single valid `complete`.
      [
        { type: 'tool_use', id: 'good', name: 'complete', input: { status: 'success', result: 'recovered' } },
        { type: 'done' },
      ],
    ]);

    await executeMovement(makeMovement(['Read']), 'task', llm as never, makeContext());

    // Inspect the messages sent on the second call — every prior tool_use must have a tool_result.
    const sentOnRetry = llm.calls[1]?.messages as SentMessage[];
    const answeredIds: Array<string | undefined> = [];
    for (const message of sentOnRetry) {
      if (message.role === 'tool') {
        answeredIds.push(message.tool_call_id);
      }
    }

    expect(answeredIds).toContain('fail-1');
    expect(answeredIds).toContain('ignored-1');
  });
});
|
||
|
||
// ============================================================
|
||
// Phase 6c — `memory_update` standalone tool
|
||
// ============================================================
|
||
|
||
/**
 * Phase 6c — the standalone `memory_update` tool.
 *
 * Each case scripts a FakeClient, runs one movement with a fresh
 * WorkspaceMemory, and inspects either the memory contents or the messages
 * sent to the client on the following iteration.
 */
describe('Phase 6c: memory_update tool', () => {
  // Drop scripted mock behaviour so cases cannot influence each other.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('mid-movement memory_update commits before terminal selection (visible to LLM next iteration)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: emit memory_update only
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: { facts: [{ claim: 'mid-movement observation' }] },
        },
        { type: 'done' },
      ],
      // iteration 1: terminal complete
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });

    expect(result.next).toBe('COMPLETE');
    expect(memory.size().facts).toBe(1);
    expect(memory.snapshot().facts[0]!.claim).toBe('mid-movement observation');
  });

  it('returns "no changes committed" tool_result for empty payload', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {} },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });

    // The next iteration's messages include the tool_result for mu-1.
    const secondCallMessages = client.calls[1]?.messages as Array<{
      role: string;
      content?: unknown;
      tool_call_id?: string;
    }>;
    const muResult = secondCallMessages.find((m) => m.role === 'tool' && m.tool_call_id === 'mu-1');
    expect(muResult).toBeDefined();
    expect(String(muResult?.content)).toMatch(/no changes committed/);
    expect(memory.size().facts).toBe(0);
  });

  it('exact-claim duplicate within same iteration is merged (Codex Phase 6c §2.5)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: { facts: [{ claim: 'X is Y', evidence_paths: ['a.ts'] }] },
        },
        {
          type: 'tool_use',
          id: 'mu-2',
          name: 'memory_update',
          input: { facts: [{ claim: 'X is Y', evidence_paths: ['b.ts'] }] },
        },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });

    const snap = memory.snapshot();
    expect(snap.facts).toHaveLength(1);
    // evidence_paths should be union-merged, not duplicated.
    // Sort a copy: Array.prototype.sort works in place, and an assertion
    // should not reorder data owned by the snapshot.
    expect([...snap.facts[0]!.evidencePaths].sort()).toEqual(['a.ts', 'b.ts']);
  });

  it('memory_update commits even when complete in the same iteration is invalid (retry path)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: memory_update + invalid complete (empty result) → retry
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: { facts: [{ claim: 'observation persists' }] },
        },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // iteration 1: corrected complete
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });

    expect(result.next).toBe('COMPLETE');
    // The memory_update from the FIRST iteration should still be committed,
    // even though the iteration retried due to invalid complete args.
    expect(memory.snapshot().facts.map((f) => f.claim)).toContain('observation persists');
  });

  it('memory_update is a META_TOOL — works even when allowed_tools is empty', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs([]));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: { facts: [{ claim: 'still works' }] },
        },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);
    const movement: Movement = {
      name: 'execute',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: [], // empty — yet memory_update is in the tool catalog
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };

    await executeMovement(movement, 'task', client as never, makeContext(), { workspaceMemory: memory });

    expect(memory.size().facts).toBe(1);
  });
});
|
||
|
||
// ============================================================
|
||
// Traceability T-1 — events.jsonl emission scenarios
|
||
// ============================================================
|
||
|
||
import { mkdtempSync, rmSync, readFileSync } from 'node:fs';
|
||
import { tmpdir } from 'node:os';
|
||
import { join } from 'node:path';
|
||
import { createFileEventLogger, parseEventLine, type EventBase } from '../progress/event-log.js';
|
||
|
||
/**
 * Loads `<workspace>/logs/events.jsonl` and returns its events in file order.
 *
 * The file content is trimmed and split on `\n`; empty lines are skipped.
 *
 * @param workspace - Directory that contains the `logs/events.jsonl` file.
 * @returns One parsed event per non-empty line.
 * @throws Error when `parseEventLine` reports a kind other than `'ok'` for a line.
 */
function readEvents(workspace: string): EventBase[] {
  const logFile = join(workspace, 'logs', 'events.jsonl');
  const rawLines = readFileSync(logFile, 'utf-8').trim().split('\n');

  const events: EventBase[] = [];
  for (const line of rawLines) {
    if (line.length === 0) {
      continue;
    }
    const parsed = parseEventLine(line);
    if (parsed.kind !== 'ok') {
      throw new Error(`unexpected non-ok event line: ${line}`);
    }
    events.push(parsed.event);
  }
  return events;
}
|
||
|
||
/**
 * Traceability T-1 — events written to `logs/events.jsonl` by the agent loop.
 *
 * Every case runs one or two movements with a file-backed event logger that
 * points at a fresh temporary workspace, then reads the log back with
 * `readEvents` and asserts on the recorded event kinds and payloads.
 */
describe('Traceability T-1: agent-loop emission', () => {
  let workspace: string;

  beforeEach(() => {
    workspace = mkdtempSync(join(tmpdir(), 'trace-emission-'));
  });

  afterEach(() => {
    // Reset the shared tool mocks as the other suites in this file do;
    // otherwise a mockResolvedValue / mockImplementation installed by one
    // case stays active for the following ones.
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
    rmSync(workspace, { recursive: true, force: true });
  });

  /** Builds a ToolContext whose event logger writes into `workspaceLocal` under run id `test-run`. */
  function makeContextWithEvents(workspaceLocal: string): ToolContext {
    return {
      workspacePath: workspaceLocal,
      editAllowed: true,
      eventLogger: createFileEventLogger({ workspacePath: workspaceLocal, runId: 'test-run' }),
    };
  }

  it('emits movement_start, tool_call/result pair, and movement_complete for a single Read flow', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });

    const client = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContextWithEvents(workspace));

    const events = readEvents(workspace);
    const kinds = events.map((e) => e.kind);
    expect(kinds).toContain('movement_start');
    expect(kinds).toContain('tool_call');
    expect(kinds).toContain('tool_result');
    expect(kinds).toContain('complete');
    expect(kinds).toContain('movement_complete');

    const call = events.find((e) => e.kind === 'tool_call' && e.llmToolCallId === 'r1');
    const result = events.find((e) => e.kind === 'tool_result' && e.llmToolCallId === 'r1');
    expect(call?.correlationId).toBeTruthy();
    // Fail with an assertion (not a TypeError on the cast below) when the event is missing.
    expect(result).toBeDefined();
    expect(result?.correlationId).toBe(call?.correlationId);
    expect((result?.payload as { cacheHit: boolean }).cacheHit).toBe(false);
  });

  it('emits cache_set on first Read and cache_hit on second Read of the same file', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });

    // Both movements share one cache and one event-logging context.
    const cache = new ToolResultCache();
    const ctx = makeContextWithEvents(workspace);

    const movA: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movA, 'task', clientA as never, ctx, { toolResultCache: cache });

    const movB: Movement = {
      name: 'plan',
      edit: false,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, ctx, { toolResultCache: cache });

    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'cache_set')).toBe(true);
    expect(events.some((e) => e.kind === 'cache_hit')).toBe(true);
    const cacheHit = events.find((e) => e.kind === 'cache_hit');
    expect((cacheHit?.payload as { sourceMovement: string }).sourceMovement).toBe('investigate');
  });

  it('emits cache_invalidate and memory_invalidate after a successful Edit', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'Read') return { output: 'body', isError: false };
      if (name === 'Edit') return { output: 'edited', isError: false };
      return { output: 'x', isError: true };
    });

    // Both movements share the cache, the memory and the event-logging context.
    const cache = new ToolResultCache();
    const memory = new WorkspaceMemory();
    const ctx = makeContextWithEvents(workspace);

    const movA: Movement = {
      name: 'investigate',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'execute',
            summary: 's',
            memory_update: { facts: [{ claim: 'foo.ts uses bar', evidence_paths: ['foo.ts'] }] },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(movA, 'task', clientA as never, ctx, { toolResultCache: cache, workspaceMemory: memory });

    const movB: Movement = {
      name: 'execute',
      edit: true,
      persona: 'p',
      instruction: 'i',
      allowedTools: ['Edit'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'e1',
          name: 'Edit',
          input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' },
        },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, ctx, { toolResultCache: cache, workspaceMemory: memory });

    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'cache_invalidate')).toBe(true);
    expect(events.some((e) => e.kind === 'memory_invalidate')).toBe(true);
  });

  it('emits memory_update_call with counts when the LLM calls memory_update', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const ctx = makeContextWithEvents(workspace);
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'mu-1',
          name: 'memory_update',
          input: { facts: [{ claim: 'x' }, { claim: 'y' }] },
        },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, ctx, { workspaceMemory: memory });

    const events = readEvents(workspace);
    const muCall = events.find((e) => e.kind === 'memory_update_call');
    expect(muCall).toBeDefined();
    const payload = muCall?.payload as { counts: { factsAdded: number } };
    expect(payload.counts.factsAdded).toBe(2);
  });

  it('emits watchdog_fire when no checklist tool is used in 5 iterations', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // Six Read-only turns followed by a terminal `complete`.
    const responses: Array<Array<{ type: string; id?: string; name?: string; input?: unknown }>> = [];
    for (let i = 0; i < 6; i++) {
      responses.push([
        { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
        { type: 'done' },
      ]);
    }
    responses.push([
      { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ]);
    const client = new FakeClient(responses as never);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContextWithEvents(workspace));

    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'watchdog_fire')).toBe(true);
  });

  it('all events share the same runId across one piece run', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const ctx = makeContextWithEvents(workspace);
    const client = new FakeClient([
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, ctx);

    const events = readEvents(workspace);
    expect(events.length).toBeGreaterThan(0);
    const runIds = new Set(events.map((e) => e.runId));
    expect(runIds.size).toBe(1);
    expect(runIds.has('test-run')).toBe(true);
  });
});
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Phase 4 (SSH Console): buildSystemPrompt screen injection
|
||
// ---------------------------------------------------------------------------
|
||
import { buildSystemPrompt, __setActiveSessionLookup, type HandoffContext } from './agent-loop.js';
|
||
|
||
/**
 * Phase 4 (SSH Console) — checks when `buildSystemPrompt` adds the
 * "Console screen" block, using a stubbed active-session lookup.
 */
describe('buildSystemPrompt console injection', () => {
  // Remove the stubbed lookup after every case.
  afterEach(() => {
    __setActiveSessionLookup(null);
  });

  /** Minimal movement whose only varying part is the allowed tool list. */
  function makeConsoleMovement(allowedTools: string[]): Movement {
    const base = { name: 'm1', edit: false, persona: 'p', instruction: 'i' };
    return {
      ...base,
      allowedTools,
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
  }

  /** Fake 80x24 session whose screen snapshot always returns `text`. */
  const sessionShowing = (text: string) => ({
    cols: 80,
    rows: 24,
    snapshotScreen: () => ({ text }),
  });

  /** Renders the prompt for the given tools and task id; all other arguments are fixed. */
  const promptFor = (tools: string[], taskId: string | null) =>
    buildSystemPrompt(
      makeConsoleMovement(tools),
      1,
      5,
      [],
      undefined,
      null,
      undefined,
      undefined,
      undefined,
      taskId,
    );

  it('appends screen block when SshConsole* in allowed_tools and session exists', () => {
    const session = sessionShowing('mock-screen-line-1\nmock-screen-line-2');
    __setActiveSessionLookup((_tid: string) => session);

    const rendered = promptFor(['SshConsoleEnsure', 'SshConsoleSend', 'SshConsoleSnapshot'], 't1');

    expect(rendered).toContain('Console screen');
    expect(rendered).toContain('mock-screen-line-1');
    expect(rendered).toContain('mock-screen-line-2');
  });

  it('does NOT inject when piece does not allow console tools', () => {
    const session = sessionShowing('mock-screen-X');
    __setActiveSessionLookup((_tid: string) => session);

    const rendered = promptFor(['Read', 'Bash'], 't1');

    expect(rendered).not.toContain('Console screen');
    expect(rendered).not.toContain('mock-screen-X');
  });

  it('does NOT inject when no active session is registered for the task', () => {
    __setActiveSessionLookup((_tid: string) => null);

    const rendered = promptFor(['SshConsoleSend'], 't1');

    expect(rendered).not.toContain('Console screen');
  });

  it('does NOT inject when taskId is missing (subtask without local_task binding)', () => {
    const session = sessionShowing('mock-screen');
    // Records the id the lookup was asked for, if it is called at all.
    let lookedUp: string | null = null;
    __setActiveSessionLookup((tid: string) => {
      lookedUp = tid;
      return session;
    });

    const rendered = promptFor(['SshConsoleSnapshot'], null);

    expect(rendered).not.toContain('Console screen');
    expect(lookedUp).toBeNull();
  });

  it('truncates injected screen to the configured tail length', () => {
    const screenLines: string[] = [];
    for (let n = 1; n <= 200; n++) {
      screenLines.push(`line-${n}`);
    }
    const session = sessionShowing(screenLines.join('\n'));
    __setActiveSessionLookup((_tid: string) => session);

    const rendered = promptFor(['SshConsoleSend'], 't1');

    // Default tail = 24 lines: should contain the last line but not very early lines.
    expect(rendered).toContain('line-200');
    expect(rendered).toContain('line-177'); // 200 - 24 + 1 = 177 (inclusive tail)
    expect(rendered).not.toContain('line-1\n');
    expect(rendered).not.toContain('line-50');
  });
});
|
||
|
||
/**
 * Handoff sections of `buildSystemPrompt`: the static Continue block and the
 * dynamic block rendered when a HandoffContext is supplied.
 */
describe('buildSystemPrompt — handoff blocks', () => {
  // buildSystemPrompt is pure — we can call it directly. Build a minimal
  // Movement that won't trigger errors in the rendering pipeline.
  const movement: Movement = {
    name: 'investigate',
    persona: 'investigator',
    instruction: 'do the thing',
    rules: [{ condition: 'done', next: 'plan' }],
    allowedTools: [],
    edit: false,
  };

  /** Renders the prompt with `handoff` as the final argument; every other argument is fixed. */
  const promptWithHandoff = (handoff: HandoffContext) =>
    buildSystemPrompt(movement, 1, 5, [], undefined, null, undefined, undefined, undefined, null, handoff);

  it('always includes the static Continue block', () => {
    const rendered = buildSystemPrompt(movement);

    expect(rendered).toContain('Continue 機能');
    expect(rendered).toContain('output/ ファイルは次の piece でもそのまま参照されます');
  });

  it('omits the dynamic block when handoffContext is not provided', () => {
    expect(buildSystemPrompt(movement)).not.toContain('前 piece からの引き継ぎ');
  });

  it('includes the dynamic block with prev piece + result when handoffContext is provided', () => {
    const rendered = promptWithHandoff({
      prevPiece: 'manual-writer',
      prevResult: 'output/manual.md is ready, with 5 sections covering setup',
    });

    expect(rendered).toContain('前 piece からの引き継ぎ');
    expect(rendered).toContain('manual-writer');
    expect(rendered).toContain('output/manual.md is ready');
  });

  it('handles null prevResult gracefully', () => {
    const rendered = promptWithHandoff({ prevPiece: 'manual-writer', prevResult: null });

    expect(rendered).toContain('前 piece からの引き継ぎ');
    expect(rendered).toContain('前 piece は最終出力を残しませんでした');
  });

  it('truncates very long prevResult to ~2500 chars (head + tail)', () => {
    // Construct: 3000 'A's + unique 'BBBBBMIDDLE' + 3000 'C's. After
    // truncation: head=2000 of 'A', truncated marker, tail=last 500 of 'C'.
    // The middle marker should be cut out.
    const head = 'A'.repeat(3000);
    const tail = 'C'.repeat(3000);
    const rendered = promptWithHandoff({
      prevPiece: 'manual-writer',
      prevResult: head + 'BBBBBMIDDLE' + tail,
    });

    expect(rendered).toContain('[truncated]');
    expect(rendered).toContain('A'.repeat(100)); // head present
    expect(rendered).toContain('C'.repeat(100)); // tail present
    expect(rendered).not.toContain('BBBBBMIDDLE'); // middle was cut
  });
});
|