diff --git a/src/engine/reflection/llm-client.test.ts b/src/engine/reflection/llm-client.test.ts
index 8a7fb11..eec9741 100644
--- a/src/engine/reflection/llm-client.test.ts
+++ b/src/engine/reflection/llm-client.test.ts
@@ -1,5 +1,5 @@
-import { describe, it, expect, vi, afterEach } from 'vitest';
-import { callReflectionLlm } from './llm-client.js';
+import { describe, it, expect, vi, afterEach, beforeEach } from 'vitest';
+import { callReflectionLlm, setReflectionRetrySleep } from './llm-client.js';
 import type { ReflectionLlmConfig } from './llm-client.js';
 
 const cfg: ReflectionLlmConfig = {
@@ -7,42 +7,45 @@ const cfg: ReflectionLlmConfig = {
   model: 'test-model',
 };
 
+const validResult = {
+  memory_changes: [],
+  piece_changes: { should_edit: false },
+  reasoning: 'x',
+};
+
+const okResponse = {
+  ok: true,
+  json: () => Promise.resolve({
+    choices: [
+      {
+        message: {
+          tool_calls: [
+            {
+              function: {
+                name: 'submit_reflection',
+                arguments: JSON.stringify(validResult),
+              },
+            },
+          ],
+        },
+      },
+    ],
+    usage: { prompt_tokens: 42, completion_tokens: 17 },
+  }),
+};
+
+beforeEach(() => {
+  // No real backoff sleeps in tests.
+  setReflectionRetrySleep(async () => {});
+});
+
 afterEach(() => {
   vi.unstubAllGlobals();
 });
 
 describe('callReflectionLlm', () => {
   it('happy path: parses tool_call arguments and extracts token usage', async () => {
-    const validResult = {
-      memory_changes: [],
-      piece_changes: { should_edit: false },
-      reasoning: 'x',
-    };
-    const mockResponse = {
-      choices: [
-        {
-          message: {
-            tool_calls: [
-              {
-                function: {
-                  name: 'submit_reflection',
-                  arguments: JSON.stringify(validResult),
-                },
-              },
-            ],
-          },
-        },
-      ],
-      usage: {
-        prompt_tokens: 42,
-        completion_tokens: 17,
-      },
-    };
-
-    vi.stubGlobal('fetch', vi.fn().mockResolvedValue({
-      ok: true,
-      json: () => Promise.resolve(mockResponse),
-    }));
+    vi.stubGlobal('fetch', vi.fn().mockResolvedValue(okResponse));
 
     const result = await callReflectionLlm(cfg, 'system prompt', 'user prompt');
 
@@ -54,21 +57,70 @@ describe('callReflectionLlm', () => {
     expect(result.durationMs).toBeGreaterThanOrEqual(0);
   });
 
-  it('error path: throws when no tool_calls present', async () => {
-    const mockResponse = {
-      choices: [
-        {
-          message: {},
-        },
-      ],
-    };
+  it('retries a 5xx (backend tool-call parse failure) and succeeds on resample', async () => {
+    const fetchMock = vi.fn()
+      .mockResolvedValueOnce({
+        ok: false,
+        status: 500,
+        text: () => Promise.resolve('{"error":{"message":"Failed to parse input at pos 41: <tool_call>..."}}'),
+      })
+      .mockResolvedValueOnce(okResponse);
+    vi.stubGlobal('fetch', fetchMock);
 
-    vi.stubGlobal('fetch', vi.fn().mockResolvedValue({
+    const result = await callReflectionLlm(cfg, 's', 'u');
+    expect(result.parsed.reasoning).toBe('x');
+    expect(fetchMock).toHaveBeenCalledTimes(2);
+  });
+
+  it('gives up after 3 attempts of persistent 5xx', async () => {
+    const fetchMock = vi.fn().mockResolvedValue({
+      ok: false,
+      status: 500,
+      text: () => Promise.resolve('parse error'),
+    });
+    vi.stubGlobal('fetch', fetchMock);
+
+    await expect(callReflectionLlm(cfg, 's', 'u')).rejects.toThrow('HTTP 500');
+    expect(fetchMock).toHaveBeenCalledTimes(3);
+  });
+
+  it('does NOT retry a 4xx (deterministic config error, e.g. invalid api key)', async () => {
+    const fetchMock = vi.fn().mockResolvedValue({
+      ok: false,
+      status: 401,
+      text: () => Promise.resolve('invalid api key'),
+    });
+    vi.stubGlobal('fetch', fetchMock);
+
+    await expect(callReflectionLlm(cfg, 's', 'u')).rejects.toThrow('HTTP 401');
+    expect(fetchMock).toHaveBeenCalledTimes(1);
+  });
+
+  it('retries when no tool_calls present, then throws after exhaustion', async () => {
+    const fetchMock = vi.fn().mockResolvedValue({
       ok: true,
-      json: () => Promise.resolve(mockResponse),
-    }));
+      json: () => Promise.resolve({ choices: [{ message: {} }] }),
+    });
+    vi.stubGlobal('fetch', fetchMock);
 
     await expect(callReflectionLlm(cfg, 'system prompt', 'user prompt'))
       .rejects.toThrow('no tool_call');
+    expect(fetchMock).toHaveBeenCalledTimes(3);
+  });
+
+  it('retries malformed tool_call arguments JSON', async () => {
+    const fetchMock = vi.fn()
+      .mockResolvedValueOnce({
+        ok: true,
+        json: () => Promise.resolve({
+          choices: [{ message: { tool_calls: [{ function: { name: 'submit_reflection', arguments: '{broken' } }] } }],
+        }),
+      })
+      .mockResolvedValueOnce(okResponse);
+    vi.stubGlobal('fetch', fetchMock);
+
+    const result = await callReflectionLlm(cfg, 's', 'u');
+    expect(result.parsed.reasoning).toBe('x');
+    expect(fetchMock).toHaveBeenCalledTimes(2);
   });
 });
diff --git a/src/engine/reflection/llm-client.ts b/src/engine/reflection/llm-client.ts
index 5dd6d58..8d3e141 100644
--- a/src/engine/reflection/llm-client.ts
+++ b/src/engine/reflection/llm-client.ts
@@ -16,12 +16,58 @@ export interface ReflectionLlmResult {
   raw: unknown;
 }
 
+/** Total attempts (1 initial + retries) for resample-worthy failures. */
+const MAX_ATTEMPTS = 3;
+/** Backoff before attempt 2 and 3. Injectable for tests. */
+const RETRY_DELAYS_MS = [500, 1500];
+
+let sleep = (ms: number) => new Promise<void>((r) => setTimeout(r, ms));
+/** Test hook: replace the backoff sleeper (avoids real timers in vitest). */
+export function setReflectionRetrySleep(fn: (ms: number) => Promise<void>): void {
+  sleep = fn;
+}
+
+/**
+ * Errors worth a resample: small reflection models occasionally emit
+ * malformed tool-call markup, which strict backends (e.g. llama-server's
+ * tool parser) reject with a 5xx like
+ * "Failed to parse input at pos 41: <tool_call>...". The sampling is
+ * stochastic (temperature 0.2), so simply asking again usually succeeds.
+ * 4xx (bad key, bad request shape) is deterministic config error — fail fast.
+ */
+class RetryableLlmError extends Error {}
+
 export async function callReflectionLlm(
   cfg: ReflectionLlmConfig,
   systemPrompt: string,
   userPrompt: string
 ): Promise<ReflectionLlmResult> {
   const start = Date.now();
+  let lastErr: Error | null = null;
+  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
+    try {
+      return await callOnce(cfg, systemPrompt, userPrompt, start);
+    } catch (e) {
+      if (!(e instanceof RetryableLlmError)) throw e;
+      lastErr = e;
+      if (attempt < MAX_ATTEMPTS) {
+        const delay = RETRY_DELAYS_MS[attempt - 1] ?? 1500;
+        logger.warn(
+          `[reflection-llm] attempt ${attempt}/${MAX_ATTEMPTS} failed (${e.message.slice(0, 200)}); retrying in ${delay}ms`,
+        );
+        await sleep(delay);
+      }
+    }
+  }
+  throw lastErr ?? new Error('reflection LLM failed');
+}
+
+async function callOnce(
+  cfg: ReflectionLlmConfig,
+  systemPrompt: string,
+  userPrompt: string,
+  start: number,
+): Promise<ReflectionLlmResult> {
   const body: Record<string, unknown> = {
     messages: [
       { role: 'system', content: systemPrompt },
@@ -43,12 +89,22 @@ export async function callReflectionLlm(
     body: JSON.stringify(body),
   });
   if (!resp.ok) {
-    throw new Error(`reflection LLM HTTP ${resp.status}: ${await resp.text()}`);
+    const text = await resp.text();
+    const msg = `reflection LLM HTTP ${resp.status}: ${text}`;
+    // 5xx: backend-side failure (incl. tool-call parse errors on malformed
+    // model output) — resample. 4xx: deterministic config error — fail fast.
+    if (resp.status >= 500) throw new RetryableLlmError(msg);
+    throw new Error(msg);
   }
   const data = await resp.json() as any;
   const toolCall = data.choices?.[0]?.message?.tool_calls?.[0];
-  if (!toolCall) throw new Error('reflection LLM returned no tool_call');
-  const parsed = JSON.parse(toolCall.function.arguments) as ReflectionResult;
+  if (!toolCall) throw new RetryableLlmError('reflection LLM returned no tool_call');
+  let parsed: ReflectionResult;
+  try {
+    parsed = JSON.parse(toolCall.function.arguments) as ReflectionResult;
+  } catch {
+    throw new RetryableLlmError('reflection LLM tool_call arguments were not valid JSON');
+  }
   return {
     parsed,
     tokensIn: data.usage?.prompt_tokens ?? 0,