maestro/src/engine/agent-loop.tool-loop.test.ts
2026-06-04 00:34:55 +00:00

163 lines
5.9 KiB
TypeScript

import { afterEach, describe, expect, it, vi } from 'vitest';
import type { LLMEvent, ToolDef } from '../llm/openai-compat.js';
import type { ToolContext } from './tools/index.js';
// Create the two mock functions through vi.hoisted so they are already
// defined when the vi.mock factory below is evaluated (Vitest moves vi.mock
// calls ahead of the file's imports, so plain top-level consts would not be).
const { executeToolMock, getToolDefsMock } = vi.hoisted(() => ({
executeToolMock: vi.fn(),
getToolDefsMock: vi.fn(),
}));
// Replace the exports of './tools/index.js' with the mocks so each test can
// script the advertised tool definitions and the result of every tool call.
// NOTE(review): presumably './agent-loop.js' consumes these two exports —
// confirm against that module.
vi.mock('./tools/index.js', () => ({
executeTool: executeToolMock,
getToolDefs: getToolDefsMock,
}));
import { executeMovement, type Movement } from './agent-loop.js';
/**
 * Builds the `execute` movement fixture used by every test in this file.
 *
 * The movement is non-editing, uses the `worker` persona, and both its single
 * rule and its default transition lead to `COMPLETE`.
 *
 * @param allowedTools Tool names the movement is permitted to call; the array
 *   is stored on the fixture as-is (not copied).
 * @returns The movement fixture.
 */
function makeMovement(allowedTools: string[]): Movement {
  const terminalState = 'COMPLETE';
  const movement: Movement = {
    name: 'execute',
    edit: false,
    persona: 'worker',
    instruction: 'Do the work.',
    allowedTools,
    rules: [{ condition: 'done', next: terminalState }],
    defaultNext: terminalState,
  };
  return movement;
}
/**
 * Builds one function-style tool definition per supplied name.
 *
 * Each definition reuses the tool name as its description and declares an
 * empty object schema (no properties, nothing required) for its parameters.
 *
 * @param names Tool names, in the order the definitions should appear.
 * @returns One definition per name, in the same order.
 */
function makeToolDefs(names: string[]): ToolDef[] {
  const defs: ToolDef[] = [];
  for (const name of names) {
    defs.push({
      type: 'function',
      function: {
        name,
        description: name,
        parameters: { type: 'object', properties: {}, required: [] },
      },
    });
  }
  return defs;
}
/**
 * Builds the tool context fixture passed to `executeMovement`: a fixed
 * workspace path with editing disabled. A new object is returned per call.
 */
function makeContext(): ToolContext {
  const workspacePath = '/tmp/agent-loop-tool-loop-test';
  const editAllowed = false;
  return { workspacePath, editAllowed };
}
/**
 * Scripted stand-in for the LLM client.
 *
 * Every `chat()` call replays the next scripted event list, in order. Once
 * the scripts are exhausted, each further call yields a single `done` event.
 */
class FakeClient {
  /** Position of the next script to replay. */
  private cursor = 0;

  constructor(private readonly responses: LLMEvent[][]) {}

  /**
   * Replays the next scripted response; both arguments are ignored.
   *
   * @returns An async generator over the events of the selected script.
   */
  async *chat(_messages: unknown, _tools?: unknown): AsyncGenerator<LLMEvent> {
    const scripted = this.responses[this.cursor];
    this.cursor += 1;
    // Out of scripts: end the turn immediately with a lone `done` event.
    const events: LLMEvent[] = scripted ?? [{ type: 'done' }];
    for (const event of events) {
      yield event;
    }
  }
}
/**
 * Scripts one model turn: a single `Read` tool call with a fixed id and a
 * fixed `file_path`, followed by the `done` event. Every call returns newly
 * created events with the same content.
 */
function identicalReadBatch(): LLMEvent[] {
  const readCall: LLMEvent = {
    type: 'tool_use',
    id: 'read',
    name: 'Read',
    input: { file_path: 'input/a.txt' },
  };
  const endOfTurn: LLMEvent = { type: 'done' };
  return [readCall, endOfTurn];
}
/**
 * Tool-call loop detection in `executeMovement`, driven by a scripted client
 * and a mocked tools module. Each case scripts model turns and then checks the
 * resulting transition plus how many tool executions were dispatched.
 */
describe('executeMovement tool-call loop detection', () => {
  /** Advertises a single `Read` tool whose every execution succeeds with `output`. */
  const stubReadTool = (output: string): void => {
    getToolDefsMock.mockResolvedValue(makeToolDefs(['Read']));
    executeToolMock.mockResolvedValue({ output, isError: false });
  };

  /** Scripts `count` consecutive turns that each issue the identical Read call. */
  const repeatedReads = (count: number): LLMEvent[][] =>
    Array.from({ length: count }, identicalReadBatch);

  /** Events for a `complete` tool call reporting success, followed by `done`. */
  const completeEvents = (): LLMEvent[] => [
    { type: 'tool_use', id: 'c', name: 'complete', input: { status: 'success', result: 'done' } },
    { type: 'done' },
  ];

  /** Runs the Read-only movement against `client` without any extra options. */
  const runWithDefaults = (client: FakeClient) =>
    executeMovement(makeMovement(['Read']), 'task', client as never, makeContext());

  afterEach(() => {
    // Drop both recorded calls and stubbed return values between cases.
    for (const mock of [executeToolMock, getToolDefsMock]) {
      mock.mockReset();
    }
  });

  it('aborts when the identical tool-call batch repeats up to the limit', async () => {
    stubReadTool('same content');
    // More identical turns are scripted than the default limit (5) allows; the
    // abort is expected on the 5th turn before it is dispatched, so only the
    // first 4 Read calls reach the tool executor.
    const client = new FakeClient(repeatedReads(6));

    const outcome = await runWithDefaults(client);

    expect(outcome.next).toBe('ABORT');
    expect(outcome.abortCode).toBe('tool_loop_detected');
    expect(executeToolMock).toHaveBeenCalledTimes(4);
  });

  it('honours a custom maxToolLoopRepeats from safetyConfig', async () => {
    stubReadTool('same content');
    const client = new FakeClient(repeatedReads(6));

    const outcome = await executeMovement(
      makeMovement(['Read']),
      'task',
      client as never,
      makeContext(),
      { safetyConfig: { maxToolLoopRepeats: 3 } },
    );

    expect(outcome.next).toBe('ABORT');
    expect(outcome.abortCode).toBe('tool_loop_detected');
    // With the limit lowered to 3 the abort lands before the 3rd dispatch,
    // leaving exactly 2 executed Read calls.
    expect(executeToolMock).toHaveBeenCalledTimes(2);
  });

  it('does not abort when identical calls stay under the limit, then completes', async () => {
    stubReadTool('same content');
    // 4 identical turns stay below the default limit of 5; the 5th turn
    // completes the movement instead of repeating.
    const client = new FakeClient([...repeatedReads(4), completeEvents()]);

    const outcome = await runWithDefaults(client);

    expect(outcome.next).toBe('COMPLETE');
    expect(executeToolMock).toHaveBeenCalledTimes(4);
  });

  it('resets the counter when the tool-call args change (no false positive)', async () => {
    stubReadTool('content');
    // 8 Read turns, each targeting a different file and using a different
    // call id, so no two consecutive turns are identical. All 8 are expected
    // to execute before the final turn completes the movement.
    const variedReads: LLMEvent[][] = [];
    for (let i = 0; i < 8; i += 1) {
      variedReads.push([
        { type: 'tool_use', id: `read-${i}`, name: 'Read', input: { file_path: `input/file-${i}.txt` } },
        { type: 'done' },
      ]);
    }
    const client = new FakeClient([...variedReads, completeEvents()]);

    const outcome = await runWithDefaults(client);

    expect(outcome.next).toBe('COMPLETE');
    expect(executeToolMock).toHaveBeenCalledTimes(8);
  });

  it('lets transition/complete win even if it shares a batch with a repeated call', async () => {
    stubReadTool('same content');
    // After 4 identical Read turns, the 5th turn repeats the very same Read
    // call and also calls `complete`; completion must take precedence over
    // loop detection.
    const client = new FakeClient([
      ...repeatedReads(4),
      [
        { type: 'tool_use', id: 'read', name: 'Read', input: { file_path: 'input/a.txt' } },
        ...completeEvents(),
      ],
    ]);

    const outcome = await runWithDefaults(client);

    expect(outcome.next).toBe('COMPLETE');
  });
});