sync: update from private repo (5091989)

2026-06-10 10:15:49 +00:00 · 2026-06-10 10:15:49 +00:00 · 000a2474aa
commit 000a2474aa
parent b5831943a4
2 changed files with 154 additions and 46 deletions
--- a/src/engine/reflection/llm-client.test.ts
+++ b/src/engine/reflection/llm-client.test.ts
@ -1,5 +1,5 @@
-import { describe, it, expect, vi, afterEach } from 'vitest';
+import { describe, it, expect, vi, afterEach, beforeEach } from 'vitest';
-import { callReflectionLlm } from './llm-client.js';
+import { callReflectionLlm, setReflectionRetrySleep } from './llm-client.js';
 import type { ReflectionLlmConfig } from './llm-client.js';
 const cfg: ReflectionLlmConfig = {
@ -7,18 +7,15 @@ const cfg: ReflectionLlmConfig = {
  model: 'test-model',
 };
-afterEach(() => {
+const validResult = {
  vi.unstubAllGlobals();
 });
 describe('callReflectionLlm', () => {
  it('happy path: parses tool_call arguments and extracts token usage', async () => {
    const validResult = {
  memory_changes: [],
  piece_changes: { should_edit: false },
  reasoning: 'x',
-    };
+};
-    const mockResponse = {
+
 const okResponse = {
  ok: true,
  json: () => Promise.resolve({
    choices: [
      {
        message: {
@ -33,16 +30,22 @@ describe('callReflectionLlm', () => {
        },
      },
    ],
-      usage: {
+    usage: { prompt_tokens: 42, completion_tokens: 17 },
-        prompt_tokens: 42,
+  }),
-        completion_tokens: 17,
+};
      },
    };
-    vi.stubGlobal('fetch', vi.fn().mockResolvedValue({
+beforeEach(() => {
-      ok: true,
+  // No real backoff sleeps in tests.
-      json: () => Promise.resolve(mockResponse),
+  setReflectionRetrySleep(async () => {});
-    }));
+});
 afterEach(() => {
  vi.unstubAllGlobals();
 });
 describe('callReflectionLlm', () => {
  it('happy path: parses tool_call arguments and extracts token usage', async () => {
    vi.stubGlobal('fetch', vi.fn().mockResolvedValue(okResponse));
    const result = await callReflectionLlm(cfg, 'system prompt', 'user prompt');
@ -54,21 +57,70 @@ describe('callReflectionLlm', () => {
    expect(result.durationMs).toBeGreaterThanOrEqual(0);
  });
-  it('error path: throws when no tool_calls present', async () => {
+  it('retries a 5xx (backend tool-call parse failure) and succeeds on resample', async () => {
-    const mockResponse = {
+    const fetchMock = vi.fn()
-      choices: [
+      .mockResolvedValueOnce({
-        {
+        ok: false,
-          message: {},
+        status: 500,
-        },
+        text: () => Promise.resolve('{"error":{"message":"Failed to parse input at pos 41: <tool_call>..."}}'),
-      ],
+      })
-    };
+      .mockResolvedValueOnce(okResponse);
    vi.stubGlobal('fetch', fetchMock);
-    vi.stubGlobal('fetch', vi.fn().mockResolvedValue({
+    const result = await callReflectionLlm(cfg, 's', 'u');
    expect(result.parsed.reasoning).toBe('x');
    expect(fetchMock).toHaveBeenCalledTimes(2);
  });
  it('gives up after 3 attempts of persistent 5xx', async () => {
    const fetchMock = vi.fn().mockResolvedValue({
      ok: false,
      status: 500,
      text: () => Promise.resolve('parse error'),
    });
    vi.stubGlobal('fetch', fetchMock);
    await expect(callReflectionLlm(cfg, 's', 'u')).rejects.toThrow('HTTP 500');
    expect(fetchMock).toHaveBeenCalledTimes(3);
  });
  it('does NOT retry a 4xx (deterministic config error, e.g. invalid api key)', async () => {
    const fetchMock = vi.fn().mockResolvedValue({
      ok: false,
      status: 401,
      text: () => Promise.resolve('invalid api key'),
    });
    vi.stubGlobal('fetch', fetchMock);
    await expect(callReflectionLlm(cfg, 's', 'u')).rejects.toThrow('HTTP 401');
    expect(fetchMock).toHaveBeenCalledTimes(1);
  });
  it('retries when no tool_calls present, then throws after exhaustion', async () => {
    const fetchMock = vi.fn().mockResolvedValue({
      ok: true,
-      json: () => Promise.resolve(mockResponse),
+      json: () => Promise.resolve({ choices: [{ message: {} }] }),
-    }));
+    });
    vi.stubGlobal('fetch', fetchMock);
    await expect(callReflectionLlm(cfg, 'system prompt', 'user prompt'))
      .rejects.toThrow('no tool_call');
    expect(fetchMock).toHaveBeenCalledTimes(3);
  });
  it('retries malformed tool_call arguments JSON', async () => {
    const fetchMock = vi.fn()
      .mockResolvedValueOnce({
        ok: true,
        json: () => Promise.resolve({
          choices: [{ message: { tool_calls: [{ function: { name: 'submit_reflection', arguments: '{broken' } }] } }],
        }),
      })
      .mockResolvedValueOnce(okResponse);
    vi.stubGlobal('fetch', fetchMock);
    const result = await callReflectionLlm(cfg, 's', 'u');
    expect(result.parsed.reasoning).toBe('x');
    expect(fetchMock).toHaveBeenCalledTimes(2);
  });
 });
--- a/src/engine/reflection/llm-client.ts
+++ b/src/engine/reflection/llm-client.ts
@ -16,12 +16,58 @@ export interface ReflectionLlmResult {
  raw: unknown;
 }
 /** Total attempts (1 initial + retries) for resample-worthy failures. */
 const MAX_ATTEMPTS = 3;
 /** Backoff before attempt 2 and 3. Injectable for tests. */
 const RETRY_DELAYS_MS = [500, 1500];
 let sleep = (ms: number) => new Promise<void>((r) => setTimeout(r, ms));
 /** Test hook: replace the backoff sleeper (avoids real timers in vitest). */
 export function setReflectionRetrySleep(fn: (ms: number) => Promise<void>): void {
  sleep = fn;
 }
 /**
 * Errors worth a resample: small reflection models occasionally emit
 * malformed tool-call markup, which strict backends (e.g. llama-server's
 * tool parser) reject with a 5xx like
 * "Failed to parse input at pos 41: <tool_call>...". The sampling is
 * stochastic (temperature 0.2), so simply asking again usually succeeds.
 * 4xx (bad key, bad request shape) is deterministic config error — fail fast.
 */
 class RetryableLlmError extends Error {}
 export async function callReflectionLlm(
  cfg: ReflectionLlmConfig,
  systemPrompt: string,
  userPrompt: string
 ): Promise<ReflectionLlmResult> {
  const start = Date.now();
  let lastErr: Error | null = null;
  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    try {
      return await callOnce(cfg, systemPrompt, userPrompt, start);
    } catch (e) {
      if (!(e instanceof RetryableLlmError)) throw e;
      lastErr = e;
      if (attempt < MAX_ATTEMPTS) {
        const delay = RETRY_DELAYS_MS[attempt - 1] ?? 1500;
        logger.warn(
          `[reflection-llm] attempt ${attempt}/${MAX_ATTEMPTS} failed (${e.message.slice(0, 200)}); retrying in ${delay}ms`,
        );
        await sleep(delay);
      }
    }
  }
  throw lastErr ?? new Error('reflection LLM failed');
 }
 async function callOnce(
  cfg: ReflectionLlmConfig,
  systemPrompt: string,
  userPrompt: string,
  start: number,
 ): Promise<ReflectionLlmResult> {
  const body: Record<string, unknown> = {
    messages: [
      { role: 'system', content: systemPrompt },
@ -43,12 +89,22 @@ export async function callReflectionLlm(
    body: JSON.stringify(body),
  });
  if (!resp.ok) {
-    throw new Error(`reflection LLM HTTP ${resp.status}: ${await resp.text()}`);
+    const text = await resp.text();
    const msg = `reflection LLM HTTP ${resp.status}: ${text}`;
    // 5xx: backend-side failure (incl. tool-call parse errors on malformed
    // model output) — resample. 4xx: deterministic config error — fail fast.
    if (resp.status >= 500) throw new RetryableLlmError(msg);
    throw new Error(msg);
  }
  const data = await resp.json() as any;
  const toolCall = data.choices?.[0]?.message?.tool_calls?.[0];
-  if (!toolCall) throw new Error('reflection LLM returned no tool_call');
+  if (!toolCall) throw new RetryableLlmError('reflection LLM returned no tool_call');
-  const parsed = JSON.parse(toolCall.function.arguments) as ReflectionResult;
+  let parsed: ReflectionResult;
  try {
    parsed = JSON.parse(toolCall.function.arguments) as ReflectionResult;
  } catch {
    throw new RetryableLlmError('reflection LLM tool_call arguments were not valid JSON');
  }
  return {
    parsed,
    tokensIn: data.usage?.prompt_tokens ?? 0,