maestro/src/engine/agent-loop.test.ts

import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import type { LLMEvent, ToolDef } from '../llm/openai-compat.js';
import type { ToolContext } from './tools/index.js';
import { stripThinkingTokens } from './agent-loop.js';

/**
 * Unit tests for `stripThinkingTokens`. Each case passes a string containing
 * one style of reasoning markup and asserts the exact text that is left.
 */
describe('stripThinkingTokens', () => {
  it('strips <think>...</think> blocks', () => {
    expect(stripThinkingTokens('<think>reasoning here</think>actual output')).toBe('actual output');
  });
  // The closing delimiter exercised here is `<|/thinking|>` (slash after the pipe).
  it('strips <|thinking|>...<|/thinking|> blocks', () => {
    expect(stripThinkingTokens('<|thinking|>deep thought<|/thinking|>result')).toBe('result');
  });
  // Covers both a bare "thought" + marker (nothing left) and a marker followed by visible text.
  it('strips gemma4 thought<channel|> pattern', () => {
    expect(stripThinkingTokens('thought\n<channel|>')).toBe('');
    expect(stripThinkingTokens('thought <channel|>hello')).toBe('hello');
  });
  it('strips <channel|>...<channel|> blocks', () => {
    expect(stripThinkingTokens('<channel|>internal reasoning<channel|>visible output')).toBe('visible output');
  });
  it('returns normal text unchanged', () => {
    expect(stripThinkingTokens('hello world')).toBe('hello world');
  });
  // Newlines inside the block must not stop the match.
  it('handles multiline thinking blocks', () => {
    expect(stripThinkingTokens('<think>\nline1\nline2\n</think>output')).toBe('output');
  });
});

// Create the two mock functions inside vi.hoisted() so they already exist when
// the vi.mock() factory below runs (vitest hoists vi.mock calls above imports).
const { executeToolMock, getToolDefsMock } = vi.hoisted(() => ({
  executeToolMock: vi.fn(),
  getToolDefsMock: vi.fn(),
}));

// Replace the tools module for this file: the mocked module only provides
// `executeTool` and `getToolDefs`, both backed by the mocks created above.
vi.mock('./tools/index.js', () => ({
  executeTool: executeToolMock,
  getToolDefs: getToolDefsMock,
}));

import { executeMovement, type Movement } from './agent-loop.js';
import { ToolResultCache } from './context/tool-result-cache.js';
import { WorkspaceMemory } from './context/workspace-memory.js';

/**
 * Builds the default movement fixture used by most tests: an editable
 * `execute` movement whose only rule and default both lead to `COMPLETE`.
 *
 * @param allowedTools Tool names the movement is allowed to use.
 * @returns A fresh movement object on every call.
 */
function makeMovement(allowedTools: string[]): Movement {
  const fixture: Movement = {
    name: 'execute',
    edit: true,
    persona: 'worker',
    instruction: 'Do the work.',
    allowedTools,
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  };
  return fixture;
}

/**
 * Produces one minimal function-style tool definition per name. The name is
 * reused as the description and the parameter schema is an empty object schema.
 *
 * @param names Tool names, in the order the definitions should appear.
 * @returns Tool definitions in the same order as `names`.
 */
function makeToolDefs(names: string[]): ToolDef[] {
  const defs: ToolDef[] = [];
  for (const toolName of names) {
    defs.push({
      type: 'function',
      function: {
        name: toolName,
        description: toolName,
        parameters: { type: 'object', properties: {}, required: [] },
      },
    });
  }
  return defs;
}

/**
 * Returns a new tool context pointing at the fixed test workspace path with
 * editing allowed.
 */
function makeContext(): ToolContext {
  const context: ToolContext = { workspacePath: '/tmp/agent-loop-test', editAllowed: true };
  return context;
}

/**
 * Creates a promise together with the function that fulfils it, so a test can
 * decide exactly when a mocked call finishes.
 *
 * @returns The pending promise and its `resolve` trigger.
 */
function deferred<T>(): { promise: Promise<T>; resolve: (value: T) => void } {
  // The Promise executor runs synchronously, so `fulfil` is set before we return.
  let fulfil!: (value: T) => void;
  const pending = new Promise<T>((settle) => {
    fulfil = settle;
  });
  return { promise: pending, resolve: (value: T) => fulfil(value) };
}

/**
 * Polls `predicate` until it returns true, yielding to the event loop with a
 * zero-delay timer between checks.
 *
 * @param predicate Condition to wait for; checked once per poll.
 * @param timeoutMs How long to keep polling before giving up (default 250 ms).
 * @throws Error('Timed out waiting for condition') when the predicate is still
 *   false after more than `timeoutMs` milliseconds have elapsed.
 */
async function waitFor(predicate: () => boolean, timeoutMs: number = 250): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  for (;;) {
    if (predicate()) {
      return;
    }
    if (Date.now() > deadline) {
      throw new Error('Timed out waiting for condition');
    }
    await new Promise((tick) => setTimeout(tick, 0));
  }
}

/**
 * Scripted stand-in for the LLM client. Every `chat()` call records its
 * arguments in `calls` and replays the next scripted list of events; once the
 * script is exhausted, further calls yield nothing.
 */
class FakeClient {
  /** Position of the next scripted response to replay. */
  private cursor = 0;
  /** Arguments of every `chat()` invocation, in call order. */
  readonly calls: Array<{ messages: unknown; tools?: unknown }> = [];
  private readonly responses: LLMEvent[][];

  constructor(responses: LLMEvent[][]) {
    this.responses = responses;
  }

  /**
   * Records the call, then yields the events of the next scripted response.
   * The abort signal is accepted for signature compatibility but not used.
   */
  async *chat(messages: unknown, tools?: unknown, _signal?: AbortSignal): AsyncGenerator<LLMEvent> {
    const slot = this.cursor;
    this.cursor = slot + 1;
    this.calls.push({ messages, tools });
    const scripted = this.responses[slot] ?? [];
    for (const scriptedEvent of scripted) {
      yield scriptedEvent;
    }
  }
}

describe('executeMovement parallel tool execution', () => {
  // Reset both tool mocks after every test so stubbed behaviour and recorded
  // calls never carry over into the next one.
  afterEach(() => {
    for (const mock of [executeToolMock, getToolDefsMock]) {
      mock.mockReset();
    }
  });

  // Two read-only tool calls (Read, Glob) arrive in the same LLM turn. Each
  // mocked tool blocks on its own gate, and the test requires both to have
  // started before either gate is released.
  it('runs consecutive safe tool calls in parallel', async () => {
    const readGate = deferred<{ output: string; isError: boolean }>();
    const globGate = deferred<{ output: string; isError: boolean }>();
    const started: string[] = [];

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Glob']));
    // Record the start order, then park each tool on its gate.
    executeToolMock.mockImplementation(async (name: string) => {
      started.push(name);
      if (name === 'Read') return readGate.promise;
      if (name === 'Glob') return globGate.promise;
      return { output: 'ok', isError: false };
    });

    // Turn 1: both tool calls. Turn 2: the `complete` call that ends the movement.
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
        { type: 'tool_use', id: 'glob-1', name: 'Glob', input: { pattern: 'input/*' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    // Not awaited yet: the movement has to stay in flight while the gates are closed.
    const runPromise = executeMovement(makeMovement(['Read', 'Glob']), 'task', client as never, makeContext());

    // Neither gate is open, so Glob can only have started if it did not wait for Read.
    await waitFor(() => started.length === 2);
    expect(started).toEqual(['Read', 'Glob']);

    readGate.resolve({ output: 'read ok', isError: false });
    globGate.resolve({ output: 'glob ok', isError: false });

    const result = await runPromise;
    expect(result.next).toBe('COMPLETE');
    expect(executeToolMock).toHaveBeenCalledTimes(2);
  });

  // Read and Write arrive in the same LLM turn. The test expects Write to start
  // only after Read's gate has been released, i.e. Write is not run alongside Read.
  it('keeps side-effecting tools sequential as a barrier', async () => {
    const readGate = deferred<{ output: string; isError: boolean }>();
    const writeGate = deferred<{ output: string; isError: boolean }>();
    const started: string[] = [];

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Write']));
    // Record the start order, then park each tool on its gate.
    executeToolMock.mockImplementation(async (name: string) => {
      started.push(name);
      if (name === 'Read') return readGate.promise;
      if (name === 'Write') return writeGate.promise;
      return { output: 'ok', isError: false };
    });

    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
        { type: 'tool_use', id: 'write-1', name: 'Write', input: { file_path: 'output/a.txt', content: 'x' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    const runPromise = executeMovement(makeMovement(['Read', 'Write']), 'task', client as never, makeContext());

    // While Read is still blocked, it must be the only tool that has started.
    await waitFor(() => started.length === 1);
    expect(started).toEqual(['Read']);

    // Releasing Read is what allows Write to begin.
    readGate.resolve({ output: 'read ok', isError: false });
    await waitFor(() => started.length === 2);
    expect(started).toEqual(['Read', 'Write']);

    writeGate.resolve({ output: 'write ok', isError: false });

    const result = await runPromise;
    expect(result.next).toBe('COMPLETE');
    expect(executeToolMock).toHaveBeenCalledTimes(2);
  });

  // The `complete` call sits between Read and Glob inside one LLM turn, and that
  // turn is the only one scripted. The test expects both regular tools to run
  // (two executeTool calls) and the movement to still finish with COMPLETE.
  it('executes regular tools before transition even if transition appears mid-batch', async () => {
    const readGate = deferred<{ output: string; isError: boolean }>();
    const globGate = deferred<{ output: string; isError: boolean }>();
    const started: string[] = [];

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Glob']));
    // Record the start order, then park each tool on its gate.
    executeToolMock.mockImplementation(async (name: string) => {
      started.push(name);
      if (name === 'Read') return readGate.promise;
      if (name === 'Glob') return globGate.promise;
      return { output: 'glob ok', isError: false };
    });

    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'tool_use', id: 'glob-1', name: 'Glob', input: { pattern: 'input/*' } },
        { type: 'done' },
      ],
    ]);

    const runPromise = executeMovement(makeMovement(['Read', 'Glob']), 'task', client as never, makeContext());

    // Both tools must have started even though the transition call appears between them.
    await waitFor(() => started.length === 2);
    expect(started).toEqual(['Read', 'Glob']);

    readGate.resolve({ output: 'read ok', isError: false });
    globGate.resolve({ output: 'glob ok', isError: false });

    const result = await runPromise;
    expect(result.next).toBe('COMPLETE');
    expect(executeToolMock).toHaveBeenCalledTimes(2);
  });

  // Scripts one turn that streams two text chunks followed by a Read call, then
  // a second turn that completes. Checks what the onText, onTextPreview and
  // onToolUse callbacks were handed.
  it('emits text, preview, and tool callbacks in order', async () => {
    const script: LLMEvent[][] = [
      [
        { type: 'text', text: 'first ' },
        { type: 'text', text: 'second' },
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/a.txt' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ];
    const streamedText: string[] = [];
    const streamedPreviews: Array<{ movement: string; preview: string }> = [];
    const reportedToolUses: Array<{ name: string; input: Record<string, unknown> }> = [];

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });

    const outcome = await executeMovement(
      makeMovement(['Read']),
      'task',
      new FakeClient(script) as never,
      makeContext(),
      {
        callbacks: {
          onText: (chunk) => streamedText.push(chunk),
          onTextPreview: (movement, preview) => streamedPreviews.push({ movement, preview }),
          onToolUse: (name, input) => reportedToolUses.push({ name, input }),
        },
      },
    );

    expect(outcome.next).toBe('COMPLETE');
    // Chunks arrive one by one; the preview is expected once, with the chunks joined.
    expect(streamedText).toEqual(['first ', 'second']);
    expect(streamedPreviews).toEqual([{ movement: 'execute', preview: 'first second' }]);
    expect(reportedToolUses).toContainEqual({ name: 'Read', input: { file_path: 'input/a.txt' } });
  });

  // Runs one successful Read followed by a completing turn and checks that
  // onToolResult reported the Read with `isError: false`.
  it('fires onToolResult callback after tool execution', async () => {
    const script: LLMEvent[][] = [
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ];
    const reported: Array<{ name: string; isError: boolean }> = [];

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });

    await executeMovement(
      makeMovement(['Read']),
      'task',
      new FakeClient(script) as never,
      makeContext(),
      {
        callbacks: {
          onToolResult: (toolName, details) => reported.push({ name: toolName, isError: details.isError }),
        },
      },
    );

    expect(reported).toContainEqual({ name: 'Read', isError: false });
  });

  // Runs one Read followed by a completing turn and checks that
  // onMemoryCheckpoint fired exactly once, with the value 1.
  it('fires onMemoryCheckpoint callback after regular tool use', async () => {
    const script: LLMEvent[][] = [
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ];
    const checkpointCounts: number[] = [];

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'read ok', isError: false });

    await executeMovement(
      makeMovement(['Read']),
      'task',
      new FakeClient(script) as never,
      makeContext(),
      {
        callbacks: {
          onMemoryCheckpoint: (toolCount) => checkpointCounts.push(toolCount),
        },
      },
    );

    expect(checkpointCounts).toEqual([1]);
  });

  // The context manager's only threshold is `force_transition` at ratio 0.005 of
  // a 10_000-token limit (0.005 × 10_000 = 50; presumably the trigger point —
  // confirm against ContextManager). The single scripted turn reports 80 prompt
  // tokens and no `complete` turn exists, yet the test expects COMPLETE with a
  // "Context limit reached" output.
  it('triggers force_transition when context manager signals exhaustion', async () => {
    const { ContextManager } = await import('./context-manager.js');
    const cm = new ContextManager({
      thresholds: [{ ratio: 0.005, action: 'force_transition' }],
      limitTokens: 10_000,
    });

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
        { type: 'done', usage: { prompt_tokens: 80, completion_tokens: 10 } },
      ],
    ]);

    const result = await executeMovement(
      makeMovement(['Read']),
      'task',
      client as never,
      makeContext(),
      { contextManager: cm },
    );

    // COMPLETE is the fixture movement's `defaultNext`.
    expect(result.next).toBe('COMPLETE');
    expect(result.output).toContain('Context limit reached');
  });

  // Same setup as a threshold test, but with a `warn` action: the first turn
  // reports 60 prompt tokens against a 0.005 × 10_000 threshold, and the test
  // expects onContextAction to receive an action whose type is 'warn'.
  it('fires onContextAction callback when context threshold crossed', async () => {
    const { ContextManager } = await import('./context-manager.js');
    const cm = new ContextManager({
      thresholds: [{ ratio: 0.005, action: 'warn' }],
      limitTokens: 10_000,
    });

    const contextActions: Array<{ type: string }> = [];

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // Unlike the force_transition case, a completing turn is scripted here.
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
        { type: 'done', usage: { prompt_tokens: 60, completion_tokens: 10 } },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    await executeMovement(
      makeMovement(['Read']),
      'task',
      client as never,
      makeContext(),
      {
        callbacks: {
          // Only the action type is recorded; other fields of the action are ignored.
          onContextAction: (action) => contextActions.push({ type: action.type }),
        },
        contextManager: cm,
      },
    );

    expect(contextActions).toContainEqual({ type: 'warn' });
  });

  // Checks the payloads handed to onContextUpdate: one with 0 prompt tokens
  // (presumably emitted before any usage was reported — confirm in the loop) and
  // one with the 1234 prompt tokens reported by the first turn, both carrying
  // the manager's 100_000-token limit.
  it('fires onContextUpdate callback after each LLM usage update', async () => {
    const { ContextManager } = await import('./context-manager.js');
    const cm = new ContextManager({ limitTokens: 100_000 });

    const updates: Array<{ promptTokens: number; limitTokens: number }> = [];

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // Only the first turn carries usage; the completing turn reports none.
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
        { type: 'done', usage: { prompt_tokens: 1234, completion_tokens: 10 } },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    await executeMovement(
      makeMovement(['Read']),
      'task',
      client as never,
      makeContext(),
      {
        callbacks: {
          onContextUpdate: (payload) => updates.push(payload),
        },
        contextManager: cm,
      },
    );

    expect(updates.length).toBeGreaterThanOrEqual(1);
    expect(updates).toContainEqual({ promptTokens: 0, limitTokens: 100_000 });
    expect(updates).toContainEqual({ promptTokens: 1234, limitTokens: 100_000 });
  });

  // A Read returns an HTML string with an 80_000-character base64 payload while
  // the context limit is 20_000 tokens. The test inspects the messages of the
  // second LLM request: the tool message must carry the omission placeholder
  // instead of the payload, and a user message must tell the agent to retry.
  it('compacts oversized tool results before the next LLM request', async () => {
    const { ContextManager } = await import('./context-manager.js');
    const cm = new ContextManager({ limitTokens: 20_000 });
    const hugeBase64 = `<html><img src="data:text/html;base64,${'A'.repeat(80_000)}"></html>`;

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: hugeBase64, isError: false });

    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'input/page.html' } },
        { type: 'done', usage: { prompt_tokens: 1000, completion_tokens: 10 } },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done', usage: { prompt_tokens: 2000, completion_tokens: 10 } },
      ],
    ]);

    const result = await executeMovement(
      makeMovement(['Read']),
      'task',
      client as never,
      makeContext(),
      { contextManager: cm },
    );

    expect(result.next).toBe('COMPLETE');
    expect(client.calls).toHaveLength(2);
    // calls[1] holds the arguments of the request sent after the Read result came back.
    const secondMessages = client.calls[1]!.messages as Array<{ role: string; content?: string }>;
    const toolMessage = secondMessages.find((message) => message.role === 'tool');
    expect(toolMessage?.content).toContain('[Tool result omitted before LLM request]');
    expect(toolMessage?.content).not.toContain(hugeBase64);
    // The agent should also receive a user-facing instruction to retry with a
    // narrower scope, instead of silently dropping content.
    const feedbackMessage = secondMessages.find(
      (message) => message.role === 'user' && message.content?.includes('LLM コンテキストに入れられませんでした'),
    );
    expect(feedbackMessage).toBeDefined();
    // The retry hint must mention at least one narrower alternative.
    expect(feedbackMessage?.content).toMatch(/Read\(offset\/limit\)|Grep/);
  });

  // The same file is read twice with a 72_000-character result each time. The
  // test inspects the third LLM request and expects exactly one placeholder for
  // the older read and exactly one tool message still holding the full content.
  it('dedupes duplicate file reads before compacting tool results', async () => {
    const { ContextManager } = await import('./context-manager.js');
    const cm = new ContextManager({ limitTokens: 40_000 });
    // 80% guard => 32_000 tokens. Each large ASCII read is roughly 20k tokens.
    // Two duplicate reads exceed the guard without dedup; after dedup, only
    // the latest read survives and the request fits.

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

    const largeContent = 'X'.repeat(72_000);
    executeToolMock.mockResolvedValue({ output: largeContent, isError: false });

    // Turns 1 and 2 read the same path; turn 3 completes.
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: '/dup.ts' } },
        { type: 'done', usage: { prompt_tokens: 500, completion_tokens: 10 } },
      ],
      [
        { type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: '/dup.ts' } },
        { type: 'done', usage: { prompt_tokens: 18_000, completion_tokens: 10 } },
      ],
      [
        { type: 'tool_use', id: 't', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    const result = await executeMovement(
      makeMovement(['Read']),
      'task',
      client as never,
      makeContext(),
      { contextManager: cm },
    );

    expect(result.next).toBe('COMPLETE');
    expect(client.calls.length).toBe(3);
    // The 3rd LLM call should see the older Read replaced with a placeholder
    const thirdCallMessages = client.calls[2]!.messages as Array<{ role: string; content?: string }>;
    const placeholderMessages = thirdCallMessages.filter(
      (m) => m.role === 'tool' && typeof m.content === 'string' && m.content.includes('Duplicate Read of /dup.ts'),
    );
    expect(placeholderMessages.length).toBe(1);
    // The latest Read should still carry the full content
    const fullReadMessages = thirdCallMessages.filter(
      (m) => m.role === 'tool' && typeof m.content === 'string' && m.content === largeContent,
    );
    expect(fullReadMessages.length).toBe(1);
  });

  // Five Bash turns each return a distinct 18_000-character output under a
  // 35_000-token limit. The test expects the final LLM request to contain at
  // least one user message starting with SUMMARY_MARKER_PREFIX.
  it('summarizes old turns when dedup+compact alone cannot fit prompt', async () => {
    const { ContextManager } = await import('./context-manager.js');
    const { SUMMARY_MARKER_PREFIX } = await import('./context/history-compactor.js');
    // Tight context, distinct Bash outputs (so dedup cannot help) each below
    // LARGE_TOOL_RESULT_TOKENS (so compaction cannot help either) — only
    // history summarization can keep the conversation going.
    // Phase 6c expanded the system prompt by ~200 chars (memory_update
    // guidance + new tool definition); we bump the test limit a bit so
    // summarization still fires on the 5th Bash turn rather than the 4th,
    // matching the FakeClient slot layout below.
    const cm = new ContextManager({ limitTokens: 35_000 });
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Bash']));
    executeToolMock.mockResolvedValue({ output: 'B'.repeat(18_000), isError: false });

    // FakeClient.chat() walks a single response list, so the isolated
    // summarization LLM call (runIsolatedLlm) consumes the next slot. Slot 5
    // is the summary response; slot 6 is the post-summary transition.
    const client = new FakeClient([
      [{ type: 'tool_use', id: 'b1', name: 'Bash', input: { command: 'echo 1' } }, { type: 'done', usage: { prompt_tokens: 1_000, completion_tokens: 5 } }],
      [{ type: 'tool_use', id: 'b2', name: 'Bash', input: { command: 'echo 2' } }, { type: 'done', usage: { prompt_tokens: 5_500, completion_tokens: 5 } }],
      [{ type: 'tool_use', id: 'b3', name: 'Bash', input: { command: 'echo 3' } }, { type: 'done', usage: { prompt_tokens: 10_000, completion_tokens: 5 } }],
      [{ type: 'tool_use', id: 'b4', name: 'Bash', input: { command: 'echo 4' } }, { type: 'done', usage: { prompt_tokens: 15_000, completion_tokens: 5 } }],
      [{ type: 'tool_use', id: 'b5', name: 'Bash', input: { command: 'echo 5' } }, { type: 'done', usage: { prompt_tokens: 20_000, completion_tokens: 5 } }],
      [{ type: 'text', text: '## ゴール\nrun bash commands\n## 進捗\nDone: 5 commands' }, { type: 'done' }],
      [
        { type: 'tool_use', id: 't', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);

    const result = await executeMovement(
      makeMovement(['Bash']),
      'task',
      client as never,
      makeContext(),
      { contextManager: cm },
    );

    expect(result.next).toBe('COMPLETE');
    // Inspect whichever request was sent last, regardless of how many calls happened.
    const finalMessages = client.calls[client.calls.length - 1]!.messages as Array<{ role: string; content?: string }>;
    const markerMessages = finalMessages.filter(
      (m) => m.role === 'user' && typeof m.content === 'string' && m.content.startsWith(SUMMARY_MARKER_PREFIX),
    );
    // Phase 6c expanded the system prompt; the exact iteration where
    // summarization fires shifted slightly. The invariant we care about
    // is "summarization fired at least once before completion".
    expect(markerMessages.length).toBeGreaterThanOrEqual(1);
  });

  // The task text alone is 10_000 characters while the context limit is 1_000
  // tokens, and the movement's defaultNext is the terminal COMPLETE. The test
  // expects ABORT with abortCode 'context_overflow' rather than a completion.
  it('aborts when initial prompt is oversized and defaultNext is terminal', async () => {
    const { ContextManager } = await import('./context-manager.js');
    const cm = new ContextManager({ limitTokens: 1_000 });

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    // The guard will fail before the main LLM is called, but the
    // last-resort summary path uses runIsolatedLlm (also via FakeClient),
    // so we provide one response for that summary call.
    const client = new FakeClient([
      [{ type: 'text', text: '### Status\nstuck on overflow' }, { type: 'done' }],
    ]);

    const result = await executeMovement(
      makeMovement(['Read']),
      'x'.repeat(10_000),
      client as never,
      makeContext(),
      { contextManager: cm },
    );

    // A terminal fallback would report a false completion, so overflow aborts.
    expect(result.next).toBe('ABORT');
    expect(result.abortCode).toBe('context_overflow');
    expect(result.output).toContain('LLM request blocked before send');
    // No tool may have been executed on this path. Note that the number of LLM
    // calls itself (isolated summary vs. main call) is not asserted here.
    expect(executeToolMock).not.toHaveBeenCalled();
  });

  // Same oversized-prompt setup (10_000-character task, 1_000-token limit), but
  // the movement has no `defaultNext` at all. The test expects ABORT with the
  // "blocked before send" output and no tool execution.
  it('falls back to ABORT when oversized prompt has no defaultNext to force-transition to', async () => {
    const { ContextManager } = await import('./context-manager.js');
    const cm = new ContextManager({ limitTokens: 1_000 });

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    // One response is scripted as a safety net; the test does not assert
    // whether it is actually consumed.
    const client = new FakeClient([
      [{ type: 'text', text: 'should not be called' }, { type: 'done' }],
    ]);

    // Movement without defaultNext
    const movement: Movement = {
      name: 'execute',
      edit: true,
      persona: 'worker',
      instruction: 'Do the work.',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
    };

    const result = await executeMovement(
      movement,
      'x'.repeat(10_000),
      client as never,
      makeContext(),
      { contextManager: cm },
    );

    expect(result.next).toBe('ABORT');
    expect(result.output).toContain('LLM request blocked before send');
    expect(executeToolMock).not.toHaveBeenCalled();
  });

  // Passes a signal that was aborted before executeMovement is called and
  // expects an ABORT result mentioning "cancelled", with no tool executed.
  it('returns ABORT immediately when cancelSignal is already aborted', async () => {
    const abortedController = new AbortController();
    abortedController.abort();

    // A Read turn is scripted, but its tool call must never be executed.
    const script: LLMEvent[][] = [
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
        { type: 'done' },
      ],
    ];
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

    const outcome = await executeMovement(
      makeMovement(['Read']),
      'task',
      new FakeClient(script) as never,
      makeContext(),
      { cancelSignal: abortedController.signal },
    );

    expect(outcome.next).toBe('ABORT');
    expect(outcome.output).toContain('cancelled');
    expect(executeToolMock).not.toHaveBeenCalled();
  });

  // Every scripted turn is plain text: no tool call and no transition. The
  // movement is expected to give up with ABORT without executing any tool.
  it('aborts after text-only responses without transition calls', async () => {
    const textOnlyTurn = (text: string): LLMEvent[] => [{ type: 'text', text }, { type: 'done' }];

    // Four turns are scripted. The retry cap is expected to be 3
    // (MAX_TEXT_ONLY_RETRIES), so the last turn should never be requested.
    const script: LLMEvent[][] = [
      textOnlyTurn('I will do the work'),
      textOnlyTurn('Still working on it'),
      textOnlyTurn('Almost done'),
      textOnlyTurn('Never reached'),
    ];
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

    const outcome = await executeMovement(
      makeMovement(['Read']),
      'task',
      new FakeClient(script) as never,
      makeContext(),
    );

    expect(outcome.next).toBe('ABORT');
    expect(executeToolMock).not.toHaveBeenCalled();
  });

  // Interleaves three text-only turns with tool-call turns. The test expects
  // the movement not to abort, i.e. the text-only turns are not counted as
  // three in a row.
  it('resets text-only counter when tool calls happen in between', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // Alternate: text-only → tool → text-only → tool → text-only → tool → complete
    // Without the fix, the 3 text-only responses would trigger abort.
    // With the fix, each tool call resets the counter so it never reaches 3 consecutive.
    const client = new FakeClient([
      [{ type: 'text', text: 'Thinking about the answer...' }, { type: 'done' }],
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'a.txt' } },
        { type: 'done' },
      ],
      [{ type: 'text', text: 'Found some info, analyzing...' }, { type: 'done' }],
      [
        { type: 'tool_use', id: 'read-2', name: 'Read', input: { file_path: 'b.txt' } },
        { type: 'done' },
      ],
      [{ type: 'text', text: 'Almost done with analysis' }, { type: 'done' }],
      [
        { type: 'tool_use', id: 'complete-1', name: 'complete', input: { status: 'success', result: 'Done' } },
        { type: 'done' },
      ],
    ]);

    const result = await executeMovement(
      makeMovement(['Read']),
      'task',
      client as never,
      makeContext(),
    );

    // Only "did not abort" and "some tool ran" are asserted, not the exact next step.
    expect(result.next).not.toBe('ABORT');
    expect(executeToolMock).toHaveBeenCalled();
  });

  // Scripts more tool-only turns (4) than the iteration budget allows (2) and
  // expects the movement to give up with ABORT and an output mentioning
  // "iteration".
  it('aborts when maxIterations is exceeded', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // LLM keeps using tools but never calls transition. Tool-call ids come from
    // the turn index so every run sends the same, unique ids (no randomness in
    // the scripted input).
    const toolResponses = Array.from({ length: 4 }, (_, turn) => [
      { type: 'tool_use' as const, id: `read-${turn}`, name: 'Read', input: { file_path: 'a.txt' } },
      { type: 'done' as const },
    ]);

    const client = new FakeClient(toolResponses);

    const result = await executeMovement(
      makeMovement(['Read']),
      'task',
      client as never,
      makeContext(),
      { maxIterations: 2 },
    );

    expect(result.next).toBe('ABORT');
    expect(result.output).toContain('iteration');
  });

});

describe('executeMovement cross-movement Read cache', () => {
  // Reset both tool mocks after every test so stubbed behaviour and recorded
  // calls never carry over into the next one.
  afterEach(() => {
    for (const mock of [executeToolMock, getToolDefsMock]) {
      mock.mockReset();
    }
  });

  // Runs two movements back to back with one shared ToolResultCache. Both read
  // foo.ts; the test expects executeTool to be called only for the first read
  // and the second movement to receive a result prefixed with a "[cached: …]" tag.
  it('returns a cached Read result when a later movement reads the same file', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

    const fileContent = 'const answer = 42;\n';
    // Only a Read of foo.ts succeeds; anything else is reported as an error.
    executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
      if (name === 'Read' && input['file_path'] === 'foo.ts') {
        return { output: fileContent, isError: false };
      }
      return { output: 'unexpected', isError: true };
    });

    const cache = new ToolResultCache();

    // First movement: reads foo.ts, then transitions to `plan`.
    const movementA: Movement = {
      name: 'investigate',
      edit: false,
      persona: 'investigator',
      instruction: 'Look at foo.ts.',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const clientA = new FakeClient([
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'transition', input: { next_step: 'plan', summary: 'foo.ts read' } },
        { type: 'done' },
      ],
    ]);
    const resultA = await executeMovement(movementA, 'task', clientA as never, makeContext(), {
      toolResultCache: cache,
    });
    expect(resultA.next).toBe('plan');
    expect(executeToolMock).toHaveBeenCalledTimes(1);
    expect(cache.size()).toBe(1);

    // Second movement: issues the same Read against the same cache instance.
    const observedToolResults: string[] = [];
    const movementB: Movement = {
      name: 'plan',
      edit: false,
      persona: 'planner',
      instruction: 'Plan based on foo.ts.',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [
        { type: 'tool_use', id: 'read-2', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'transition-2', name: 'complete', input: { status: 'success', result: 'used cached read' } },
        { type: 'done' },
      ],
    ]);
    const resultB = await executeMovement(movementB, 'task', clientB as never, makeContext(), {
      toolResultCache: cache,
      callbacks: {
        onToolResult: (_name, info) => observedToolResults.push(info.result),
      },
    });

    expect(resultB.next).toBe('COMPLETE');
    // executeTool was NOT invoked again — the cache short-circuited the second Read.
    expect(executeToolMock).toHaveBeenCalledTimes(1);
    // The LLM saw a cached-tagged result, not the raw content.
    expect(observedToolResults.some((r) => r.startsWith('[cached: Read foo.ts from movement investigate at '))).toBe(true);
    expect(observedToolResults.some((r) => r.endsWith(fileContent))).toBe(true);
  });

  // A Read that comes back with `isError: true` must leave the cache empty.
  it('does not cache error results', async () => {
    const cache = new ToolResultCache();
    // Same shape as the shared fixture, with the investigate-specific fields overridden.
    const movement: Movement = {
      ...makeMovement(['Read']),
      name: 'investigate',
      edit: false,
      persona: 'investigator',
      instruction: 'Look at missing.ts.',
    };
    const script: LLMEvent[][] = [
      [
        { type: 'tool_use', id: 'read-1', name: 'Read', input: { file_path: 'missing.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'tried' } },
        { type: 'done' },
      ],
    ];

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ENOENT: no such file', isError: true });

    await executeMovement(movement, 'task', new FakeClient(script) as never, makeContext(), {
      toolResultCache: cache,
    });

    expect(cache.size()).toBe(0);
  });

  // A successful Bash call must not add anything to the cache.
  it('skips caching tools outside the cacheable allowlist (Bash etc.)', async () => {
    const cache = new ToolResultCache();
    // Same shape as the shared fixture, with the investigate-specific fields overridden.
    const movement: Movement = {
      ...makeMovement(['Bash']),
      name: 'investigate',
      edit: false,
      persona: 'investigator',
      instruction: 'Run.',
    };
    const script: LLMEvent[][] = [
      [
        { type: 'tool_use', id: 'b-1', name: 'Bash', input: { command: 'echo hi' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'transition-1', name: 'complete', input: { status: 'success', result: 'ran' } },
        { type: 'done' },
      ],
    ];

    getToolDefsMock.mockResolvedValue(makeToolDefs(['Bash']));
    executeToolMock.mockResolvedValue({ output: 'shell ok', isError: false });

    await executeMovement(movement, 'task', new FakeClient(script) as never, makeContext(), {
      toolResultCache: cache,
    });

    expect(cache.size()).toBe(0);
  });
});

describe('executeMovement Phase 2 cache invalidation', () => {
  /** Minimal movement whose single rule and default transition both lead to `next`. */
  const stage = (name: string, allowedTools: string[], next: string, edit = true): Movement => ({
    name,
    edit,
    persona: 'p',
    instruction: 'i',
    allowedTools,
    rules: [{ condition: 'done', next }],
    defaultNext: next,
  });

  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('invalidates cached Read after a successful Edit on the same path', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));

    // `fileVersion` is bumped by each Edit of foo.ts; `versionsRead` records the
    // version visible to every Read that actually reached executeTool.
    let fileVersion = 1;
    const versionsRead: number[] = [];
    executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
      const targetsFoo = input['file_path'] === 'foo.ts';
      if (targetsFoo && name === 'Read') {
        versionsRead.push(fileVersion);
        return { output: `version=${fileVersion}`, isError: false };
      }
      if (targetsFoo && name === 'Edit') {
        fileVersion += 1;
        return { output: 'edited', isError: false };
      }
      return { output: 'unexpected', isError: true };
    });

    const sharedCache = new ToolResultCache();

    // Movement 1: Read foo.ts, which leaves one entry in the cache.
    const investigateClient = new FakeClient([
      [
        { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 'read' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('investigate', ['Read'], 'execute'), 'task', investigateClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(sharedCache.size()).toBe(1);

    // Movement 2: a successful Edit of the same path.
    const executeClient = new FakeClient([
      [
        { type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't2', name: 'transition', input: { next_step: 'verify', summary: 'edited' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('execute', ['Edit'], 'verify'), 'task', executeClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(sharedCache.size()).toBe(0); // Edit evicted the Read entry

    // Movement 3: Read foo.ts again and capture what the callback reports.
    const reportedResults: string[] = [];
    const verifyClient = new FakeClient([
      [
        { type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't3', name: 'complete', input: { status: 'success', result: 're-read' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('verify', ['Read'], 'COMPLETE', false), 'task', verifyClient as never, makeContext(), {
      toolResultCache: sharedCache,
      callbacks: { onToolResult: (_n, info) => reportedResults.push(info.result) },
    });

    // Re-read happened (no cache hit) and saw the post-edit version.
    expect(versionsRead).toEqual([1, 2]);
    expect(reportedResults.includes('version=2')).toBe(true);
    expect(reportedResults.every((r) => !r.startsWith('[cached:'))).toBe(true);
  });

  it('invalidates only the affected path; other cached files survive', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));

    // Every call that reaches executeTool is logged with its tool name and file_path.
    const toolCalls: Array<{ name: string; path?: string }> = [];
    executeToolMock.mockImplementation(async (name: string, input: Record<string, unknown>) => {
      toolCalls.push({ name, path: input['file_path'] as string | undefined });
      switch (name) {
        case 'Read':
          return { output: `${input['file_path']} body`, isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });
    const executedReads = (path: string): number =>
      toolCalls.filter((call) => call.name === 'Read' && call.path === path).length;

    const sharedCache = new ToolResultCache();

    // Movement 1: read both files so each gets its own cache entry.
    const investigateClient = new FakeClient([
      [
        { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'bar.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('investigate', ['Read'], 'execute'), 'task', investigateClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(sharedCache.size()).toBe(2);

    // Movement 2: edit foo.ts only, then read both files again.
    const reportedResults: string[] = [];
    const executeClient = new FakeClient([
      [
        { type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'r3', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'tool_use', id: 'r4', name: 'Read', input: { file_path: 'bar.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('execute', ['Edit', 'Read'], 'COMPLETE'), 'task', executeClient as never, makeContext(), {
      toolResultCache: sharedCache,
      callbacks: { onToolResult: (_n, info) => reportedResults.push(info.result) },
    });

    expect(executedReads('foo.ts')).toBe(2); // re-read after Edit
    expect(executedReads('bar.ts')).toBe(1); // bar still cached
    expect(reportedResults.some((r) => r.startsWith('[cached: Read bar.ts'))).toBe(true);
    expect(reportedResults.every((r) => !r.startsWith('[cached: Read foo.ts'))).toBe(true);
  });

  it('Bash invalidates every file-derived entry conservatively', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Bash']));

    let readInvocations = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'Read':
          readInvocations += 1;
          return { output: `read ${readInvocations}`, isError: false };
        case 'Bash':
          return { output: 'shell ok', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedCache = new ToolResultCache();

    // Movement 1: two Reads populate two cache entries.
    const investigateClient = new FakeClient([
      [
        { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'bar.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(
      stage('investigate', ['Read'], 'execute', false),
      'task',
      investigateClient as never,
      makeContext(),
      { toolResultCache: sharedCache },
    );
    expect(sharedCache.size()).toBe(2);

    // Movement 2: one successful Bash command that names neither file.
    const executeClient = new FakeClient([
      [
        { type: 'tool_use', id: 'b1', name: 'Bash', input: { command: 'echo hi' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(
      stage('execute', ['Bash'], 'COMPLETE', false),
      'task',
      executeClient as never,
      makeContext(),
      { toolResultCache: sharedCache },
    );
    expect(sharedCache.size()).toBe(0); // every Read evicted
  });

  it('does NOT invalidate when Edit/Write/Bash returned an error', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));

    let failedEdits = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'Read':
          return { output: 'body', isError: false };
        case 'Edit':
          failedEdits += 1;
          return { output: 'permission denied', isError: true };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedCache = new ToolResultCache();

    // Movement 1: cache a Read of foo.ts.
    const investigateClient = new FakeClient([
      [
        { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('investigate', ['Read'], 'execute'), 'task', investigateClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(sharedCache.size()).toBe(1);

    // Movement 2: the Edit of the same path reports an error.
    const executeClient = new FakeClient([
      [
        { type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('execute', ['Edit'], 'COMPLETE'), 'task', executeClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    expect(failedEdits).toBe(1);
    expect(sharedCache.size()).toBe(1); // failed Edit kept the cache intact
  });
});

describe('executeMovement Phase 3 WorkspaceMemory', () => {
  /**
   * Minimal non-editing movement whose single rule and default transition both
   * lead to `next`; `overrides` replaces individual fields where a test needs to.
   */
  const stage = (
    name: string,
    allowedTools: string[],
    next: string,
    overrides: Partial<Movement> = {},
  ): Movement => ({
    name,
    edit: false,
    persona: 'p',
    instruction: 'i',
    allowedTools,
    rules: [{ condition: 'done', next }],
    defaultNext: next,
    ...overrides,
  });

  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('persists memory_update from one movement and exposes it in the next system prompt', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });

    const sharedMemory = new WorkspaceMemory();

    // Movement 1 transitions with a memory_update carrying one fact, one
    // decision and one do_not_repeat entry.
    const investigateClient = new FakeClient([
      [
        { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'done' },
      ],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'plan',
            summary: 'investigated',
            memory_update: {
              facts: [{ claim: 'foo.ts uses bar()', evidence_paths: ['foo.ts'], confidence: 'high' }],
              decisions: [{ text: 'patch foo.ts directly', evidence_paths: ['foo.ts'] }],
              do_not_repeat: ['re-read foo.ts unless evidence breaks'],
            },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(
      stage('investigate', ['Read'], 'plan', { persona: 'investigator', instruction: 'Find the bug.' }),
      'task',
      investigateClient as never,
      makeContext(),
      { workspaceMemory: sharedMemory },
    );

    const counts = sharedMemory.size();
    expect(counts.facts).toBe(1);
    expect(counts.decisions).toBe(1);
    expect(counts.doNotRepeat).toBe(1);

    // Movement 2 shares the same memory; inspect the first message of its first LLM call.
    const planClient = new FakeClient([
      [
        { type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 'planned' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(
      stage('plan', ['Read'], 'COMPLETE', { persona: 'planner', instruction: 'Plan the fix.' }),
      'task',
      planClient as never,
      makeContext(),
      { workspaceMemory: sharedMemory },
    );

    const [firstMessage] = planClient.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(firstMessage!.role).toBe('system');
    const expectedFragments = [
      '## これまでに蓄積した観測',
      'foo.ts uses bar()',
      'patch foo.ts directly',
      're-read foo.ts unless evidence breaks',
      'memory は再調査禁止の根拠ではなく',
    ];
    for (const fragment of expectedFragments) {
      expect(firstMessage!.content).toContain(fragment);
    }
  });

  it('omits memory section when memory is empty', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

    const emptyMemory = new WorkspaceMemory();
    const llm = new FakeClient([
      [
        { type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 'done' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('investigate', ['Read'], 'COMPLETE'), 'task', llm as never, makeContext(), {
      workspaceMemory: emptyMemory,
    });

    const [firstMessage] = llm.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(firstMessage!.content).not.toContain('## これまでに蓄積した観測');
  });

  it('Edit invalidates a fact whose evidence_paths includes the edited file', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'Read':
          return { output: 'body', isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedMemory = new WorkspaceMemory();

    // Movement 1 records two facts: one backed by foo.ts, one backed by bar.ts.
    const investigateClient = new FakeClient([
      [
        { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'done' },
      ],
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: {
            next_step: 'execute',
            summary: 'investigated',
            memory_update: {
              facts: [
                { claim: 'foo.ts uses bar()', evidence_paths: ['foo.ts'] },
                { claim: 'unrelated truth', evidence_paths: ['bar.ts'] },
              ],
            },
          },
        },
        { type: 'done' },
      ],
    ]);
    await executeMovement(
      stage('investigate', ['Read'], 'execute', { edit: true }),
      'task',
      investigateClient as never,
      makeContext(),
      { workspaceMemory: sharedMemory },
    );
    expect(sharedMemory.snapshot().facts).toHaveLength(2);

    // Movement 2 successfully edits foo.ts.
    const executeClient = new FakeClient([
      [
        { type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 'edited' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(
      stage('execute', ['Edit'], 'COMPLETE', { edit: true }),
      'task',
      executeClient as never,
      makeContext(),
      { workspaceMemory: sharedMemory },
    );

    // Only the bar.ts-backed fact is expected to remain.
    const { facts: remainingFacts } = sharedMemory.snapshot();
    expect(remainingFacts).toHaveLength(1);
    expect(remainingFacts[0]!.claim).toBe('unrelated truth');
  });

  it('still works when piece submits no memory_update (backward compat)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });

    const sharedMemory = new WorkspaceMemory();
    // Neither scripted tool call carries a memory_update payload.
    const llm = new FakeClient([
      [
        { type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 'just summary' } },
        { type: 'done' },
      ],
    ]);
    const outcome = await executeMovement(stage('investigate', ['Read'], 'COMPLETE'), 'task', llm as never, makeContext(), {
      workspaceMemory: sharedMemory,
    });

    expect(outcome.next).toBe('COMPLETE');
    expect(sharedMemory.size().facts).toBe(0);
  });
});

describe('executeMovement Phase 4 cache extension', () => {
  /** Minimal movement whose single rule and default transition both lead to `next`. */
  const stage = (name: string, allowedTools: string[], next: string, edit = false): Movement => ({
    name,
    edit,
    persona: 'p',
    instruction: 'i',
    allowedTools,
    rules: [{ condition: 'done', next }],
    defaultNext: next,
  });

  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('caches Grep results and serves them to a later movement', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Grep']));

    // Counts the Grep calls that actually reach executeTool.
    let grepInvocations = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name !== 'Grep') return { output: 'x', isError: true };
      grepInvocations += 1;
      return { output: 'matches in src/foo.ts:1\n', isError: false };
    });

    const sharedCache = new ToolResultCache();

    const investigateClient = new FakeClient([
      [
        { type: 'tool_use', id: 'g1', name: 'Grep', input: { pattern: 'foo', path: 'src/' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('investigate', ['Grep'], 'plan'), 'task', investigateClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });
    expect(grepInvocations).toBe(1);

    // Second movement repeats the identical Grep input.
    const reportedResults: string[] = [];
    const planClient = new FakeClient([
      [
        { type: 'tool_use', id: 'g2', name: 'Grep', input: { pattern: 'foo', path: 'src/' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('plan', ['Grep'], 'COMPLETE'), 'task', planClient as never, makeContext(), {
      toolResultCache: sharedCache,
      callbacks: { onToolResult: (_n, info) => reportedResults.push(info.result) },
    });

    // Still one real invocation; the callback saw a cached-tagged result.
    expect(grepInvocations).toBe(1);
    expect(reportedResults.some((r) => r.startsWith('[cached: Grep foo'))).toBe(true);
  });

  it('Edit invalidates ALL search entries (Grep) regardless of scope', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Grep', 'Edit']));

    let grepInvocations = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'Grep':
          grepInvocations += 1;
          return { output: `match v${grepInvocations}`, isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedCache = new ToolResultCache();

    const investigateClient = new FakeClient([
      [
        { type: 'tool_use', id: 'g1', name: 'Grep', input: { pattern: 'x', path: 'src/' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('investigate', ['Grep'], 'execute', true), 'task', investigateClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    // The Edit's file_path (unrelated.ts) differs from the Grep's path input (src/).
    const executeClient = new FakeClient([
      [
        { type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'unrelated.ts', old_string: 'a', new_string: 'b' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'g2', name: 'Grep', input: { pattern: 'x', path: 'src/' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('execute', ['Edit', 'Grep'], 'COMPLETE', true), 'task', executeClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    expect(grepInvocations).toBe(2);
  });

  it('caches WebFetch and survives Edit invalidation', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['WebFetch', 'Edit']));

    let fetchInvocations = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      switch (name) {
        case 'WebFetch':
          fetchInvocations += 1;
          return { output: 'page body', isError: false };
        case 'Edit':
          return { output: 'edited', isError: false };
        default:
          return { output: 'x', isError: true };
      }
    });

    const sharedCache = new ToolResultCache();

    const investigateClient = new FakeClient([
      [
        { type: 'tool_use', id: 'w1', name: 'WebFetch', input: { url: 'https://example.com/x' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'execute', summary: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('investigate', ['WebFetch'], 'execute', true), 'task', investigateClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    // An Edit comes first, then the same URL is requested again.
    const executeClient = new FakeClient([
      [
        { type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'w2', name: 'WebFetch', input: { url: 'https://example.com/x' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(stage('execute', ['Edit', 'WebFetch'], 'COMPLETE', true), 'task', executeClient as never, makeContext(), {
      toolResultCache: sharedCache,
    });

    expect(fetchInvocations).toBe(1);
  });

  it('caches Office tool results (ReadPdf) keyed by path + range', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['ReadPdf']));

    let pdfInvocations = 0;
    executeToolMock.mockImplementation(async (name: string) => {
      if (name !== 'ReadPdf') return { output: 'x', isError: true };
      pdfInvocations += 1;
      return { output: 'pdf body', isError: false };
    });

    const sharedCache = new ToolResultCache();
    // The same movement definition is executed twice against the shared cache.
    const investigate = stage('investigate', ['ReadPdf'], 'COMPLETE');

    const firstRunClient = new FakeClient([
      [
        { type: 'tool_use', id: 'p1', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '1-3' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(investigate, 'task', firstRunClient as never, makeContext(), { toolResultCache: sharedCache });
    expect(pdfInvocations).toBe(1);

    // Second run repeats pages 1-3 and additionally asks for pages 4-6.
    const secondRunClient = new FakeClient([
      [
        { type: 'tool_use', id: 'p2', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '1-3' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'p3', name: 'ReadPdf', input: { path: 'doc.pdf', page_range: '4-6' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 't2', name: 'complete', input: { status: 'success', result: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(investigate, 'task', secondRunClient as never, makeContext(), { toolResultCache: sharedCache });
    expect(pdfInvocations).toBe(2);
  });
});

describe('buildSystemPrompt checklist guidance', () => {
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('emits a "進捗管理" section with strengthened CreateChecklist guidance', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));

    const investigate: Movement = {
      ...makeMovement(['Read']),
      name: 'investigate',
      edit: false,
      persona: 'p',
      instruction: 'i',
    };
    // The scripted LLM completes immediately; only the prompt it was sent matters here.
    const llm = new FakeClient([
      [
        { type: 'tool_use', id: 't1', name: 'complete', input: { status: 'success', result: 's' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(investigate, 'task', llm as never, makeContext());

    const [firstMessage] = llm.calls[0]?.messages as Array<{ role: string; content: string }>;
    expect(firstMessage!.role).toBe('system');

    const expectedFragments = [
      '## 進捗管理',
      'CreateChecklist',
      '全 piece で常時利用可能',
      // Stronger language (option A): an explicit "three or more tool calls"
      // criterion, "when in doubt, create one", and an explicit caution that
      // applies to the second and later exchanges.
      '3 個以上のツール呼び出し',
      '判断に迷ったら作る',
      '2 回目以降のやり取り',
    ];
    for (const fragment of expectedFragments) {
      expect(firstMessage!.content).toContain(fragment);
    }
  });
});

describe('checklist watchdog (option B)', () => {
  /** Loosely typed scripted LLM event; arrays of these are handed to FakeClient via `as never`. */
  type ScriptedEvent = { type: string; id?: string; name?: string; input?: unknown; text?: string };

  /** Scripts `count` consecutive LLM turns, each issuing one Read on a distinct file (`f0.ts`, `f1.ts`, …). */
  const readTurns = (count: number): ScriptedEvent[][] =>
    Array.from({ length: count }, (_, i) => [
      { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
      { type: 'done' },
    ]);

  /** Scripts the final turn that ends the movement with a successful `complete`. */
  const completeTurn = (): ScriptedEvent[] => [
    { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
    { type: 'done' },
  ];

  /**
   * Counts the user-role string messages tagged `[checklist watchdog]` in the
   * message list passed to the client's LAST recorded call.
   */
  const countWatchdogMessages = (client: { calls: ReadonlyArray<{ messages: unknown }> }): number => {
    const finalMessages = client.calls[client.calls.length - 1]!.messages as Array<{ role: string; content?: unknown }>;
    return finalMessages.filter(
      // The typeof guard narrows `m.content` to string, so no type assertion is needed.
      (m) => m.role === 'user' && typeof m.content === 'string' && m.content.includes('[checklist watchdog]'),
    ).length;
  };

  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('injects a one-shot reminder after 5 iterations without any checklist tool', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // 6 iterations of plain Read → 1 iteration with complete. The watchdog
    // should fire on iteration 5 (0-indexed).
    const responses: ScriptedEvent[][] = [...readTurns(6), completeTurn()];
    const client = new FakeClient(responses as never);

    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());

    // Exactly one reminder is expected, even though more iterations followed it.
    expect(countWatchdogMessages(client)).toBe(1);
  });

  it('does NOT fire when CreateChecklist is called early', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'CreateChecklist']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // CreateChecklist on iteration 0, then several Read, then complete.
    const responses: ScriptedEvent[][] = [
      [{ type: 'tool_use', id: 'cc', name: 'CreateChecklist', input: { name: 'plan', items: [] } }, { type: 'done' }],
      ...readTurns(7),
      completeTurn(),
    ];
    const client = new FakeClient(responses as never);

    await executeMovement(makeMovement(['Read', 'CreateChecklist']), 'task', client as never, makeContext());

    expect(countWatchdogMessages(client)).toBe(0);
  });
});

// ============================================================
// Phase 6a — `complete` tool (§7.1–7.7 of design plan)
// ============================================================

describe('Phase 6a: complete tool — §7.1 native basics', () => {
  // Checks how a native `complete` tool call maps onto the movement's
  // `next` / `output`, and that calls missing required fields are retried.
  afterEach(() => {
    getToolDefsMock.mockReset();
    executeToolMock.mockReset();
  });

  /** Executes the shared Read-only movement against an already-scripted client. */
  const runReadOnly = (client: unknown) =>
    executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());

  it('success status with result becomes the movement output (no chatter contamination)', async () => {
    const chatter = "I'll figure this out first";
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      [
        { type: 'text', text: chatter }, // free text preceding the terminal call; must be ignored
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'final answer' } },
        { type: 'done' },
      ],
    ]);

    const { next, output } = await runReadOnly(client);

    expect(next).toBe('COMPLETE');
    expect(output).toBe('final answer');
    expect(output).not.toContain(chatter);
  });

  it('aborted status routes via ABORT next', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'aborted', abort_reason: 'tool unavailable' } },
        { type: 'done' },
      ],
    ]);

    const { next, output } = await runReadOnly(client);

    expect(next).toBe('ABORT');
    expect(output).toBe('tool unavailable');
  });

  it('needs_user_input routes via ASK next', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: { status: 'needs_user_input', missing_info: 'which dataset?', why_no_default: 'multiple options' },
        },
        { type: 'done' },
      ],
    ]);

    const { next, output } = await runReadOnly(client);

    expect(next).toBe('ASK');
    expect(output).toBe('which dataset?');
  });

  it('rejects success with empty result and forces retry (no accumulatedText fallback for native)', async () => {
    const fallbackChatter = 'this would have been the fallback';
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      // Turn 1: success with an empty result.
      [
        { type: 'text', text: fallbackChatter },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // Turn 2: the corrected call.
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'real answer' } },
        { type: 'done' },
      ],
    ]);

    const { next, output } = await runReadOnly(client);

    expect(next).toBe('COMPLETE');
    expect(output).toBe('real answer');
    // Text from the rejected turn must not end up in the final output.
    expect(output).not.toContain(fallbackChatter);
  });

  it('rejects aborted without abort_reason and forces retry', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'aborted' } },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'aborted', abort_reason: 'real reason' } },
        { type: 'done' },
      ],
    ]);

    const { next, output } = await runReadOnly(client);

    expect(next).toBe('ABORT');
    expect(output).toBe('real reason');
  });
});

describe('Phase 6a: complete tool — §7.2 precedence rules (§2.5)', () => {
  // Covers which `complete` call wins when a turn contains an invalid call or
  // more than one call.
  afterEach(() => {
    getToolDefsMock.mockReset();
    executeToolMock.mockReset();
  });

  /** Runs the Read-only movement with the given scripted client and yields only its output. */
  const outputOf = async (client: unknown) => {
    const outcome = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());
    return outcome.output;
  };

  it('invalid native complete forces retry (no fallback)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      // Turn 1: empty result makes the call invalid, so the loop has to retry.
      [
        { type: 'text', text: 'this would have been the fallback' },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // Turn 2: a valid native call.
      [
        { type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'native corrected' } },
        { type: 'done' },
      ],
    ]);

    const output = await outputOf(client);

    expect(output).toBe('native corrected');
    // Text accumulated during the rejected turn must stay out of the output.
    expect(output).not.toContain('fallback');
  });

  it('two native completes with conflicting args → retry', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      // Turn 1: two calls that disagree on `result`.
      [
        { type: 'tool_use', id: 'a', name: 'complete', input: { status: 'success', result: 'A' } },
        { type: 'tool_use', id: 'b', name: 'complete', input: { status: 'success', result: 'B' } },
        { type: 'done' },
      ],
      // Turn 2: a single unambiguous call.
      [
        { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'consistent' } },
        { type: 'done' },
      ],
    ]);

    expect(await outputOf(client)).toBe('consistent');
  });

  it('two native completes with identical args → first one used', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      // Only one turn is scripted: the duplicated call must be accepted without a retry.
      [
        { type: 'tool_use', id: 'a', name: 'complete', input: { status: 'success', result: 'same answer' } },
        { type: 'tool_use', id: 'b', name: 'complete', input: { status: 'success', result: 'same answer' } },
        { type: 'done' },
      ],
    ]);

    expect(await outputOf(client)).toBe('same answer');
  });
});

describe('Phase 6a: complete tool — §7.4 regressions', () => {
  // Guards the `transition` tool: movement-to-movement routing keeps working,
  // while a terminal 'COMPLETE' sent through `transition` is refused.
  afterEach(() => {
    getToolDefsMock.mockReset();
    executeToolMock.mockReset();
  });

  /** Builds a fresh read-only 'investigate' movement; only the routing fields vary per case. */
  const investigateWith = (routing: Pick<Movement, 'rules' | 'defaultNext'>): Movement => ({
    name: 'investigate',
    edit: false,
    persona: 'p',
    instruction: 'i',
    allowedTools: ['Read'],
    ...routing,
  });

  it('non-terminal transition (movement-to-movement) still works', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const movement = investigateWith({
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    });
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 'investigated' } },
        { type: 'done' },
      ],
    ]);

    const { next, output } = await executeMovement(movement, 'task', client as never, makeContext());

    expect(next).toBe('plan');
    expect(output).toBe('investigated');
  });

  it('transition({next_step:"COMPLETE"}) is rejected (Phase 6b)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    // FakeClient does not validate tool schemas, but the engine's
    // validateTransition still refuses unknown `next` values. A legacy terminal
    // transition therefore comes back as a tool-result error and the LLM has
    // to retry. This pins the no-shim behavior.
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 't1',
          name: 'transition',
          input: { next_step: 'COMPLETE', summary: 'should not be accepted' },
        },
        { type: 'done' },
      ],
      [
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'corrected' } },
        { type: 'done' },
      ],
    ]);
    // The rules deliberately omit COMPLETE, so the legacy terminal cannot succeed.
    const movement = investigateWith({
      rules: [{ condition: 'fix needed', next: 'fix' }],
    });

    const { output } = await executeMovement(movement, 'task', client as never, makeContext());

    expect(output).toBe('corrected');
  });
});

describe('Phase 6a: complete tool — §7.5 memory_update behavior', () => {
  // Verifies when a `memory_update` payload embedded in a `complete` call is
  // written to the WorkspaceMemory passed to executeMovement.
  afterEach(() => {
    getToolDefsMock.mockReset();
    executeToolMock.mockReset();
  });

  /** Runs the Read-only movement with `memory` attached as the workspace memory. */
  const runWithMemory = (client: unknown, memory: WorkspaceMemory) =>
    executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });

  it('memory_update inside complete is applied to WorkspaceMemory (success)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'success',
            result: 'done',
            memory_update: { facts: [{ claim: 'X is Y', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
    ]);

    await runWithMemory(client, memory);

    expect(memory.size().facts).toBe(1);
  });

  it('memory_update is NOT applied when complete is invalid and retried', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // Rejected attempt (empty result): its memory_update must not be committed.
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'success',
            result: '',
            memory_update: { facts: [{ claim: 'should not commit', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
      // Accepted retry carrying a different fact: only this one may be committed.
      [
        {
          type: 'tool_use',
          id: 'c2',
          name: 'complete',
          input: {
            status: 'success',
            result: 'fixed',
            memory_update: { facts: [{ claim: 'committed once', confidence: 'high' }] },
          },
        },
        { type: 'done' },
      ],
    ]);

    await runWithMemory(client, memory);

    expect(memory.size().facts).toBe(1);
    expect(memory.snapshot().facts[0]!.claim).toBe('committed once');
  });

  it('memory_update applied for status=aborted', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        {
          type: 'tool_use',
          id: 'c1',
          name: 'complete',
          input: {
            status: 'aborted',
            abort_reason: 'tool unavailable',
            memory_update: { facts: [{ claim: 'lesson from failure', confidence: 'medium' }] },
          },
        },
        { type: 'done' },
      ],
    ]);

    await runWithMemory(client, memory);

    expect(memory.size().facts).toBe(1);
  });
});

describe('Phase 6a: complete tool — §7.7 v3 critical (Conditional Go) tests', () => {
  // Conversation-history integrity: when a turn is rejected and retried, every
  // tool_use id from that turn must be answered by a tool_result message.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('all tool_use ids get a tool_result on retry (conversation history integrity)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const client = new FakeClient([
      // Turn 1: an invalid complete (empty result) followed by a second complete.
      [
        { type: 'tool_use', id: 'fail-1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'tool_use', id: 'ignored-1', name: 'complete', input: { status: 'success', result: 'X' } },
        { type: 'done' },
      ],
      // Turn 2: the recovery call.
      [
        { type: 'tool_use', id: 'good', name: 'complete', input: { status: 'success', result: 'recovered' } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());

    // If the retry never happened, fail with a readable assertion rather than a
    // TypeError from calling `.filter` on undefined below.
    expect(client.calls.length).toBeGreaterThanOrEqual(2);

    // Inspect the messages sent on the second call — every prior tool_use must have a tool_result.
    const secondCallMessages = client.calls[1]!.messages as Array<{ role: string; tool_call_id?: string }>;
    const toolResultIds = secondCallMessages.filter((m) => m.role === 'tool').map((m) => m.tool_call_id);
    expect(toolResultIds).toContain('fail-1');
    expect(toolResultIds).toContain('ignored-1');
  });
});

// ============================================================
// Phase 6c — `memory_update` standalone tool
// ============================================================

describe('Phase 6c: memory_update tool', () => {
  // Exercises the standalone `memory_update` tool: commit timing, the
  // empty-payload response, duplicate merging, and availability when the
  // movement allows no tools.
  afterEach(() => {
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
  });

  it('mid-movement memory_update commits before terminal selection (visible to LLM next iteration)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: emit memory_update only
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {
          facts: [{ claim: 'mid-movement observation' }],
        } },
        { type: 'done' },
      ],
      // iteration 1: terminal complete
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(result.next).toBe('COMPLETE');
    expect(memory.size().facts).toBe(1);
    expect(memory.snapshot().facts[0]!.claim).toBe('mid-movement observation');
  });

  it('returns "no changes committed" tool_result for empty payload', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {} },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), {
      workspaceMemory: memory,
    });
    // If the second iteration never ran, fail with a readable assertion rather
    // than a TypeError from calling `.find` on undefined below.
    expect(client.calls.length).toBeGreaterThanOrEqual(2);
    // The next iteration's messages include the tool_result for mu-1.
    const secondCallMessages = client.calls[1]!.messages as Array<{ role: string; content?: unknown; tool_call_id?: string }>;
    const muResult = secondCallMessages.find((m) => m.role === 'tool' && m.tool_call_id === 'mu-1');
    expect(muResult).toBeDefined();
    expect(String(muResult?.content)).toMatch(/no changes committed/);
    expect(memory.size().facts).toBe(0);
  });

  it('exact-claim duplicate within same iteration is merged (Codex Phase 6c §2.5)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // Two memory_update calls in one turn with the same claim but different evidence.
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {
          facts: [{ claim: 'X is Y', evidence_paths: ['a.ts'] }],
        } },
        { type: 'tool_use', id: 'mu-2', name: 'memory_update', input: {
          facts: [{ claim: 'X is Y', evidence_paths: ['b.ts'] }],
        } },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    const snap = memory.snapshot();
    expect(snap.facts).toHaveLength(1);
    // evidence_paths should be union-merged, not duplicated. Sort a copy so the
    // assertion does not reorder the array held by the snapshot.
    expect([...snap.facts[0]!.evidencePaths].sort()).toEqual(['a.ts', 'b.ts']);
  });

  it('memory_update commits even when complete in the same iteration is invalid (retry path)', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      // iteration 0: memory_update + invalid complete (empty result) → retry
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {
          facts: [{ claim: 'observation persists' }],
        } },
        { type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: '' } },
        { type: 'done' },
      ],
      // iteration 1: corrected complete
      [{ type: 'tool_use', id: 'c2', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    const result = await executeMovement(makeMovement(['Read']), 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(result.next).toBe('COMPLETE');
    // The memory_update from the FIRST iteration should still be committed,
    // even though the iteration retried due to invalid complete args.
    expect(memory.snapshot().facts.map((f) => f.claim)).toContain('observation persists');
  });

  it('memory_update is a META_TOOL — works even when allowed_tools is empty', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs([]));
    const memory = new WorkspaceMemory();
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: {
          facts: [{ claim: 'still works' }],
        } },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    const movement: Movement = {
      name: 'execute', edit: false, persona: 'p', instruction: 'i',
      allowedTools: [],   // empty — yet memory_update is in the tool catalog
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    await executeMovement(movement, 'task', client as never, makeContext(), { workspaceMemory: memory });
    expect(memory.size().facts).toBe(1);
  });
});

// ============================================================
// Traceability T-1 — events.jsonl emission scenarios
// ============================================================

import { mkdtempSync, rmSync, readFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { createFileEventLogger, parseEventLine, type EventBase } from '../progress/event-log.js';

/**
 * Reads `<workspace>/logs/events.jsonl` and parses each non-empty line with
 * `parseEventLine`.
 *
 * @param workspace - Workspace root that contains the `logs` directory.
 * @returns The parsed events in file order.
 * @throws Error when a line does not parse to an `ok` result.
 */
function readEvents(workspace: string): EventBase[] {
  const logFile = join(workspace, 'logs', 'events.jsonl');
  const events: EventBase[] = [];
  for (const line of readFileSync(logFile, 'utf-8').trim().split('\n')) {
    if (line.length === 0) continue;
    const parsed = parseEventLine(line);
    if (parsed.kind !== 'ok') throw new Error(`unexpected non-ok event line: ${line}`);
    events.push(parsed.event);
  }
  return events;
}

describe('Traceability T-1: agent-loop emission', () => {
  // Each test runs one or more movements with a file-backed event logger
  // rooted in a fresh temp workspace, then reads logs/events.jsonl back and
  // asserts on the emitted event kinds and payloads.
  let workspace: string;

  beforeEach(() => {
    workspace = mkdtempSync(join(tmpdir(), 'trace-emission-'));
  });

  afterEach(() => {
    // Reset the shared tool mocks as the other suites in this file do, so an
    // implementation installed by one test cannot leak into the next.
    executeToolMock.mockReset();
    getToolDefsMock.mockReset();
    rmSync(workspace, { recursive: true, force: true });
  });

  /** Builds a ToolContext whose event logger writes under `workspaceLocal` with runId 'test-run'. */
  function makeContextWithEvents(workspaceLocal: string): ToolContext {
    return {
      workspacePath: workspaceLocal,
      editAllowed: true,
      eventLogger: createFileEventLogger({ workspacePath: workspaceLocal, runId: 'test-run' }),
    };
  }

  it('emits movement_start, tool_call/result pair, and movement_complete for a single Read flow', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });

    const client = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContextWithEvents(workspace));

    const events = readEvents(workspace);
    const kinds = events.map((e) => e.kind);
    expect(kinds).toContain('movement_start');
    expect(kinds).toContain('tool_call');
    expect(kinds).toContain('tool_result');
    expect(kinds).toContain('complete');
    expect(kinds).toContain('movement_complete');

    // The call and its result must be linked by the same correlationId.
    const call = events.find((e) => e.kind === 'tool_call' && e.llmToolCallId === 'r1');
    const result = events.find((e) => e.kind === 'tool_result' && e.llmToolCallId === 'r1');
    expect(call?.correlationId).toBeTruthy();
    expect(result?.correlationId).toBe(call?.correlationId);
    expect((result?.payload as { cacheHit: boolean }).cacheHit).toBe(false);
  });

  it('emits cache_set on first Read and cache_hit on second Read of the same file', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'body', isError: false });

    // Both movements share one cache and one context (and so one event log).
    const cache = new ToolResultCache();
    const ctx = makeContextWithEvents(workspace);

    const movA: Movement = {
      name: 'investigate', edit: false, persona: 'p', instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'plan' }],
      defaultNext: 'plan',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 't1', name: 'transition', input: { next_step: 'plan', summary: 's' } }, { type: 'done' }],
    ]);
    await executeMovement(movA, 'task', clientA as never, ctx, { toolResultCache: cache });

    const movB: Movement = {
      name: 'plan', edit: false, persona: 'p', instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'r2', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, ctx, { toolResultCache: cache });

    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'cache_set')).toBe(true);
    expect(events.some((e) => e.kind === 'cache_hit')).toBe(true);
    // The hit must name the movement that originally populated the entry.
    const cacheHit = events.find((e) => e.kind === 'cache_hit');
    expect((cacheHit?.payload as { sourceMovement: string }).sourceMovement).toBe('investigate');
  });

  it('emits cache_invalidate and memory_invalidate after a successful Edit', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read', 'Edit']));
    executeToolMock.mockImplementation(async (name: string) => {
      if (name === 'Read') return { output: 'body', isError: false };
      if (name === 'Edit') return { output: 'edited', isError: false };
      return { output: 'x', isError: true };
    });

    const cache = new ToolResultCache();
    const memory = new WorkspaceMemory();
    const ctx = makeContextWithEvents(workspace);

    // Movement A reads foo.ts and records a fact whose evidence is foo.ts.
    const movA: Movement = {
      name: 'investigate', edit: true, persona: 'p', instruction: 'i',
      allowedTools: ['Read'],
      rules: [{ condition: 'done', next: 'execute' }],
      defaultNext: 'execute',
    };
    const clientA = new FakeClient([
      [{ type: 'tool_use', id: 'r1', name: 'Read', input: { file_path: 'foo.ts' } }, { type: 'done' }],
      [
        { type: 'tool_use', id: 't1', name: 'transition', input: {
          next_step: 'execute',
          summary: 's',
          memory_update: { facts: [{ claim: 'foo.ts uses bar', evidence_paths: ['foo.ts'] }] },
        } },
        { type: 'done' },
      ],
    ]);
    await executeMovement(movA, 'task', clientA as never, ctx, { toolResultCache: cache, workspaceMemory: memory });

    // Movement B edits the same file.
    const movB: Movement = {
      name: 'execute', edit: true, persona: 'p', instruction: 'i',
      allowedTools: ['Edit'],
      rules: [{ condition: 'done', next: 'COMPLETE' }],
      defaultNext: 'COMPLETE',
    };
    const clientB = new FakeClient([
      [{ type: 'tool_use', id: 'e1', name: 'Edit', input: { file_path: 'foo.ts', old_string: 'a', new_string: 'b' } }, { type: 'done' }],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(movB, 'task', clientB as never, ctx, { toolResultCache: cache, workspaceMemory: memory });

    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'cache_invalidate')).toBe(true);
    expect(events.some((e) => e.kind === 'memory_invalidate')).toBe(true);
  });

  it('emits memory_update_call with counts when the LLM calls memory_update', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const memory = new WorkspaceMemory();
    const ctx = makeContextWithEvents(workspace);
    const client = new FakeClient([
      [
        { type: 'tool_use', id: 'mu-1', name: 'memory_update', input: { facts: [{ claim: 'x' }, { claim: 'y' }] } },
        { type: 'done' },
      ],
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, ctx, { workspaceMemory: memory });

    const events = readEvents(workspace);
    const muCall = events.find((e) => e.kind === 'memory_update_call');
    expect(muCall).toBeDefined();
    // Two distinct claims were submitted, so two facts should be counted as added.
    const payload = muCall?.payload as { counts: { factsAdded: number } };
    expect(payload.counts.factsAdded).toBe(2);
  });

  it('emits watchdog_fire when no checklist tool is used in 5 iterations', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output: 'ok', isError: false });

    // Six Read-only turns followed by a terminal complete; no checklist tool is ever called.
    const responses: Array<Array<{ type: string; id?: string; name?: string; input?: unknown }>> = [];
    for (let i = 0; i < 6; i++) {
      responses.push([
        { type: 'tool_use', id: `r-${i}`, name: 'Read', input: { file_path: `f${i}.ts` } },
        { type: 'done' },
      ]);
    }
    responses.push([
      { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
      { type: 'done' },
    ]);
    const client = new FakeClient(responses as never);
    await executeMovement(makeMovement(['Read']), 'task', client as never, makeContextWithEvents(workspace));

    const events = readEvents(workspace);
    expect(events.some((e) => e.kind === 'watchdog_fire')).toBe(true);
  });

  it('all events share the same runId across one piece run', async () => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    const ctx = makeContextWithEvents(workspace);
    const client = new FakeClient([
      [{ type: 'tool_use', id: 'c1', name: 'complete', input: { status: 'success', result: 'done' } }, { type: 'done' }],
    ]);
    await executeMovement(makeMovement(['Read']), 'task', client as never, ctx);

    const events = readEvents(workspace);
    expect(events.length).toBeGreaterThan(0);
    // Every event must carry the runId the logger was created with.
    const runIds = new Set(events.map((e) => e.runId));
    expect(runIds.size).toBe(1);
    expect(runIds.has('test-run')).toBe(true);
  });
});

// ---------------------------------------------------------------------------
// Phase 4 (SSH Console): buildSystemPrompt screen injection
// ---------------------------------------------------------------------------
import { buildSystemPrompt, __setActiveSessionLookup, type HandoffContext } from './agent-loop.js';

describe('buildSystemPrompt console injection', () => {
  // Restore the session lookup after every case so a fake does not leak out.
  afterEach(() => {
    __setActiveSessionLookup(null);
  });

  /** Minimal movement; only the allowed tool list differs between cases. */
  const makeConsoleMovement = (allowedTools: string[]): Movement => ({
    name: 'm1',
    edit: false,
    persona: 'p',
    instruction: 'i',
    allowedTools,
    rules: [{ condition: 'done', next: 'COMPLETE' }],
    defaultNext: 'COMPLETE',
  });

  /** Fake 80x24 console session whose screen snapshot always returns `screenText`. */
  const makeFakeSession = (screenText: string) => ({
    cols: 80,
    rows: 24,
    snapshotScreen: () => ({ text: screenText }),
  });

  /**
   * Calls buildSystemPrompt with the positional arguments all cases share;
   * only the allowed tools and the task id (10th argument) vary.
   */
  const renderPrompt = (allowedTools: string[], taskId: string | null) =>
    buildSystemPrompt(
      makeConsoleMovement(allowedTools),
      1,
      5,
      [],
      undefined,
      null,
      undefined,
      undefined,
      undefined,
      taskId,
    );

  it('appends screen block when SshConsole* in allowed_tools and session exists', () => {
    const session = makeFakeSession('mock-screen-line-1\nmock-screen-line-2');
    __setActiveSessionLookup((_tid: string) => session);

    const sys = renderPrompt(['SshConsoleEnsure', 'SshConsoleSend', 'SshConsoleSnapshot'], 't1');

    expect(sys).toContain('Console screen');
    expect(sys).toContain('mock-screen-line-1');
    expect(sys).toContain('mock-screen-line-2');
  });

  it('does NOT inject when piece does not allow console tools', () => {
    const session = makeFakeSession('mock-screen-X');
    __setActiveSessionLookup((_tid: string) => session);

    const sys = renderPrompt(['Read', 'Bash'], 't1');

    expect(sys).not.toContain('Console screen');
    expect(sys).not.toContain('mock-screen-X');
  });

  it('does NOT inject when no active session is registered for the task', () => {
    __setActiveSessionLookup((_tid: string) => null);

    const sys = renderPrompt(['SshConsoleSend'], 't1');

    expect(sys).not.toContain('Console screen');
  });

  it('does NOT inject when taskId is missing (subtask without local_task binding)', () => {
    const session = makeFakeSession('mock-screen');
    // Record the id the lookup was asked for; it must stay null, i.e. never be called.
    let requestedTaskId: string | null = null;
    __setActiveSessionLookup((tid: string) => {
      requestedTaskId = tid;
      return session;
    });

    const sys = renderPrompt(['SshConsoleSnapshot'], null);

    expect(sys).not.toContain('Console screen');
    expect(requestedTaskId).toBeNull();
  });

  it('truncates injected screen to the configured tail length', () => {
    // 200 numbered lines: far more than fits in the injected tail.
    const screenLines = Array.from({ length: 200 }, (_, i) => `line-${i + 1}`);
    const session = makeFakeSession(screenLines.join('\n'));
    __setActiveSessionLookup((_tid: string) => session);

    const sys = renderPrompt(['SshConsoleSend'], 't1');

    // Default tail = 24 lines: the last line is kept, very early lines are dropped.
    expect(sys).toContain('line-200');
    expect(sys).toContain('line-177'); // 200 - 24 + 1 = 177 (inclusive tail)
    expect(sys).not.toContain('line-1\n');
    expect(sys).not.toContain('line-50');
  });
});

describe('buildSystemPrompt — handoff blocks', () => {
  // buildSystemPrompt is pure, so these cases call it directly with a minimal
  // Movement that renders without errors.
  const movement: Movement = {
    name: 'investigate',
    persona: 'investigator',
    instruction: 'do the thing',
    rules: [{ condition: 'done', next: 'plan' }],
    allowedTools: [],
    edit: false,
  };

  /** Renders the prompt with `handoff` as the 11th argument; the other arguments are the shared fixed values. */
  const renderWithHandoff = (handoff: HandoffContext) =>
    buildSystemPrompt(movement, 1, 5, [], undefined, null, undefined, undefined, undefined, null, handoff);

  it('always includes the static Continue block', () => {
    const rendered = buildSystemPrompt(movement);
    expect(rendered).toContain('Continue 機能');
    expect(rendered).toContain('output/ ファイルは次の piece でもそのまま参照されます');
  });

  it('omits the dynamic block when handoffContext is not provided', () => {
    const rendered = buildSystemPrompt(movement);
    expect(rendered).not.toContain('前 piece からの引き継ぎ');
  });

  it('includes the dynamic block with prev piece + result when handoffContext is provided', () => {
    const rendered = renderWithHandoff({
      prevPiece: 'manual-writer',
      prevResult: 'output/manual.md is ready, with 5 sections covering setup',
    });
    expect(rendered).toContain('前 piece からの引き継ぎ');
    expect(rendered).toContain('manual-writer');
    expect(rendered).toContain('output/manual.md is ready');
  });

  it('handles null prevResult gracefully', () => {
    const rendered = renderWithHandoff({ prevPiece: 'manual-writer', prevResult: null });
    expect(rendered).toContain('前 piece からの引き継ぎ');
    expect(rendered).toContain('前 piece は最終出力を残しませんでした');
  });

  it('truncates very long prevResult to ~2500 chars (head + tail)', () => {
    // Input: 3000 'A's, a unique 'BBBBBMIDDLE' marker, then 3000 'C's.
    // Expected after truncation: a head of 2000 'A's, the truncation marker,
    // and a tail of the last 500 'C's, so the middle marker is cut out.
    const head = 'A'.repeat(3000);
    const tail = 'C'.repeat(3000);
    const rendered = renderWithHandoff({
      prevPiece: 'manual-writer',
      prevResult: head + 'BBBBBMIDDLE' + tail,
    });
    expect(rendered).toContain('[truncated]');
    expect(rendered).toContain('A'.repeat(100)); // head survived
    expect(rendered).toContain('C'.repeat(100)); // tail survived
    expect(rendered).not.toContain('BBBBBMIDDLE'); // middle removed
  });
});