diff --git a/src/engine/reflection/llm-client.test.ts b/src/engine/reflection/llm-client.test.ts index 8a7fb11..eec9741 100644 --- a/src/engine/reflection/llm-client.test.ts +++ b/src/engine/reflection/llm-client.test.ts @@ -1,5 +1,5 @@ -import { describe, it, expect, vi, afterEach } from 'vitest'; -import { callReflectionLlm } from './llm-client.js'; +import { describe, it, expect, vi, afterEach, beforeEach } from 'vitest'; +import { callReflectionLlm, setReflectionRetrySleep } from './llm-client.js'; import type { ReflectionLlmConfig } from './llm-client.js'; const cfg: ReflectionLlmConfig = { @@ -7,42 +7,45 @@ const cfg: ReflectionLlmConfig = { model: 'test-model', }; +const validResult = { + memory_changes: [], + piece_changes: { should_edit: false }, + reasoning: 'x', +}; + +const okResponse = { + ok: true, + json: () => Promise.resolve({ + choices: [ + { + message: { + tool_calls: [ + { + function: { + name: 'submit_reflection', + arguments: JSON.stringify(validResult), + }, + }, + ], + }, + }, + ], + usage: { prompt_tokens: 42, completion_tokens: 17 }, + }), +}; + +beforeEach(() => { + // No real backoff sleeps in tests. + setReflectionRetrySleep(async () => {}); +}); + afterEach(() => { vi.unstubAllGlobals(); }); describe('callReflectionLlm', () => { it('happy path: parses tool_call arguments and extracts token usage', async () => { - const validResult = { - memory_changes: [], - piece_changes: { should_edit: false }, - reasoning: 'x', - }; - const mockResponse = { - choices: [ - { - message: { - tool_calls: [ - { - function: { - name: 'submit_reflection', - arguments: JSON.stringify(validResult), - }, - }, - ], - }, - }, - ], - usage: { - prompt_tokens: 42, - completion_tokens: 17, - }, - }; - - vi.stubGlobal('fetch', vi.fn().mockResolvedValue({ - ok: true, - json: () => Promise.resolve(mockResponse), - })); + vi.stubGlobal('fetch', vi.fn().mockResolvedValue(okResponse)); const result = await callReflectionLlm(cfg, 'system prompt', 'user prompt'); @@ -54,21 +57,70 @@ describe('callReflectionLlm', () => { expect(result.durationMs).toBeGreaterThanOrEqual(0); }); - it('error path: throws when no tool_calls present', async () => { - const mockResponse = { - choices: [ - { - message: {}, - }, - ], - }; + it('retries a 5xx (backend tool-call parse failure) and succeeds on resample', async () => { + const fetchMock = vi.fn() + .mockResolvedValueOnce({ + ok: false, + status: 500, + text: () => Promise.resolve('{"error":{"message":"Failed to parse input at pos 41: ..."}}'), + }) + .mockResolvedValueOnce(okResponse); + vi.stubGlobal('fetch', fetchMock); - vi.stubGlobal('fetch', vi.fn().mockResolvedValue({ + const result = await callReflectionLlm(cfg, 's', 'u'); + expect(result.parsed.reasoning).toBe('x'); + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + + it('gives up after 3 attempts of persistent 5xx', async () => { + const fetchMock = vi.fn().mockResolvedValue({ + ok: false, + status: 500, + text: () => Promise.resolve('parse error'), + }); + vi.stubGlobal('fetch', fetchMock); + + await expect(callReflectionLlm(cfg, 's', 'u')).rejects.toThrow('HTTP 500'); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it('does NOT retry a 4xx (deterministic config error, e.g. invalid api key)', async () => { + const fetchMock = vi.fn().mockResolvedValue({ + ok: false, + status: 401, + text: () => Promise.resolve('invalid api key'), + }); + vi.stubGlobal('fetch', fetchMock); + + await expect(callReflectionLlm(cfg, 's', 'u')).rejects.toThrow('HTTP 401'); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it('retries when no tool_calls present, then throws after exhaustion', async () => { + const fetchMock = vi.fn().mockResolvedValue({ ok: true, - json: () => Promise.resolve(mockResponse), - })); + json: () => Promise.resolve({ choices: [{ message: {} }] }), + }); + vi.stubGlobal('fetch', fetchMock); await expect(callReflectionLlm(cfg, 'system prompt', 'user prompt')) .rejects.toThrow('no tool_call'); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it('retries malformed tool_call arguments JSON', async () => { + const fetchMock = vi.fn() + .mockResolvedValueOnce({ + ok: true, + json: () => Promise.resolve({ + choices: [{ message: { tool_calls: [{ function: { name: 'submit_reflection', arguments: '{broken' } }] } }], + }), + }) + .mockResolvedValueOnce(okResponse); + vi.stubGlobal('fetch', fetchMock); + + const result = await callReflectionLlm(cfg, 's', 'u'); + expect(result.parsed.reasoning).toBe('x'); + expect(fetchMock).toHaveBeenCalledTimes(2); }); }); diff --git a/src/engine/reflection/llm-client.ts b/src/engine/reflection/llm-client.ts index 5dd6d58..8d3e141 100644 --- a/src/engine/reflection/llm-client.ts +++ b/src/engine/reflection/llm-client.ts @@ -16,12 +16,58 @@ export interface ReflectionLlmResult { raw: unknown; } +/** Total attempts (1 initial + retries) for resample-worthy failures. */ +const MAX_ATTEMPTS = 3; +/** Backoff before attempt 2 and 3. Injectable for tests. */ +const RETRY_DELAYS_MS = [500, 1500]; + +let sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); +/** Test hook: replace the backoff sleeper (avoids real timers in vitest). */ +export function setReflectionRetrySleep(fn: (ms: number) => Promise): void { + sleep = fn; +} + +/** + * Errors worth a resample: small reflection models occasionally emit + * malformed tool-call markup, which strict backends (e.g. llama-server's + * tool parser) reject with a 5xx like + * "Failed to parse input at pos 41: ...". The sampling is + * stochastic (temperature 0.2), so simply asking again usually succeeds. + * 4xx (bad key, bad request shape) is deterministic config error — fail fast. + */ +class RetryableLlmError extends Error {} + export async function callReflectionLlm( cfg: ReflectionLlmConfig, systemPrompt: string, userPrompt: string ): Promise { const start = Date.now(); + let lastErr: Error | null = null; + for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) { + try { + return await callOnce(cfg, systemPrompt, userPrompt, start); + } catch (e) { + if (!(e instanceof RetryableLlmError)) throw e; + lastErr = e; + if (attempt < MAX_ATTEMPTS) { + const delay = RETRY_DELAYS_MS[attempt - 1] ?? 1500; + logger.warn( + `[reflection-llm] attempt ${attempt}/${MAX_ATTEMPTS} failed (${e.message.slice(0, 200)}); retrying in ${delay}ms`, + ); + await sleep(delay); + } + } + } + throw lastErr ?? new Error('reflection LLM failed'); +} + +async function callOnce( + cfg: ReflectionLlmConfig, + systemPrompt: string, + userPrompt: string, + start: number, +): Promise { const body: Record = { messages: [ { role: 'system', content: systemPrompt }, @@ -43,12 +89,22 @@ export async function callReflectionLlm( body: JSON.stringify(body), }); if (!resp.ok) { - throw new Error(`reflection LLM HTTP ${resp.status}: ${await resp.text()}`); + const text = await resp.text(); + const msg = `reflection LLM HTTP ${resp.status}: ${text}`; + // 5xx: backend-side failure (incl. tool-call parse errors on malformed + // model output) — resample. 4xx: deterministic config error — fail fast. + if (resp.status >= 500) throw new RetryableLlmError(msg); + throw new Error(msg); } const data = await resp.json() as any; const toolCall = data.choices?.[0]?.message?.tool_calls?.[0]; - if (!toolCall) throw new Error('reflection LLM returned no tool_call'); - const parsed = JSON.parse(toolCall.function.arguments) as ReflectionResult; + if (!toolCall) throw new RetryableLlmError('reflection LLM returned no tool_call'); + let parsed: ReflectionResult; + try { + parsed = JSON.parse(toolCall.function.arguments) as ReflectionResult; + } catch { + throw new RetryableLlmError('reflection LLM tool_call arguments were not valid JSON'); + } return { parsed, tokensIn: data.usage?.prompt_tokens ?? 0,