// maestro/src/engine/reflection/llm-client.ts

import { logger } from '../../logger.js';
import { getDefaultProviderRetryConfig } from '../../config.js';
import { OpenAICompatClient, type LLMEvent, type Message, type ToolDef } from '../../llm/openai-compat.js';
import type { ReflectionResult } from './types.js';
import { REFLECTION_TOOL_SCHEMA } from './reflection-schema.js';

/**
 * Settings for a reflection LLM call. The values are handed to the shared
 * OpenAICompatClient (constructor arguments, or the per-call options in the
 * case of `userId`).
 */
export interface ReflectionLlmConfig {
  /** Endpoint passed to the shared client as its first constructor argument. */
  endpoint: string;
  /** Model passed to the shared client. The key is required but the value may be `undefined`. */
  model: string | undefined;
  /** Optional API key passed to the shared client. */
  apiKey?: string;
  /**
   * True when the reflection worker routes through the AAO Gateway (proxy).
   * Only a literal `true` turns proxy mode on; omitted or `false` means direct.
   */
  proxy?: boolean;
  /** Reflection target user — recorded as the usage owner. */
  userId?: string;
  /**
   * Model context window in tokens. Passed to the shared client's
   * prompt-size preflight guard. Reflection prompts can be large (uncapped
   * memory snapshot), so use the worker's real limit rather than the
   * client's conservative 32k default, which would block valid prompts.
   */
  contextLimitTokens?: number;
}

/** Outcome of a successful reflection LLM call. */
export interface ReflectionLlmResult {
  /**
   * Arguments of the first `submit_reflection` tool call. Only a shallow
   * presence check of `piece_changes` and `reasoning` has been applied; this
   * is not a full schema validation.
   */
  parsed: ReflectionResult;
  /** Prompt tokens reported by the client; 0 when no usage was reported. */
  tokensIn: number;
  /** Completion tokens reported by the client; 0 when no usage was reported. */
  tokensOut: number;
  /**
   * Wall-clock milliseconds measured from the start of the whole call, so it
   * includes any failed attempts and backoff waits that preceded the success.
   */
  durationMs: number;
  /**
   * Debugging payload rebuilt in an OpenAI-response-like shape: `usage`,
   * `backendId`, and `choices[0].message.tool_calls` carrying the parsed
   * arguments re-serialized as JSON.
   */
  raw: unknown;
}

/** Total attempts (1 initial + retries) for resample-worthy failures. */
const MAX_ATTEMPTS = 3;
/**
 * Backoff in milliseconds before attempt 2 and attempt 3, indexed by the
 * failed attempt number minus one. The delays themselves are fixed; tests
 * avoid the real wait by replacing the sleeper via setReflectionRetrySleep.
 */
const RETRY_DELAYS_MS = [500, 1500];

/**
 * Sleeper used between resample attempts. Defaults to a setTimeout-backed
 * promise and is kept as a reassignable binding so tests can substitute it.
 */
let sleep: (ms: number) => Promise<void> = (ms) =>
  new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });

/**
 * Test hook: replace the backoff sleeper (avoids real timers in vitest).
 *
 * @param fn - Replacement sleeper; receives the backoff delay in milliseconds.
 */
export function setReflectionRetrySleep(fn: (ms: number) => Promise<void>): void {
  sleep = fn;
}

/**
 * Marker error for failures worth a resample: small reflection models
 * occasionally emit malformed tool-call markup, which strict backends (e.g.
 * llama-server's tool parser) reject with a 5xx like
 * "Failed to parse input at pos 41: <tool_call>...". The sampling is
 * stochastic (temperature 0.2), so simply asking again usually succeeds.
 * 4xx (bad key, bad request shape) is a deterministic config error and is
 * reported with a plain Error instead, so it fails fast.
 *
 * The retry loop distinguishes the two cases with `instanceof`.
 */
class RetryableLlmError extends Error {
  /**
   * @param message - Human-readable failure description.
   */
  constructor(message?: string) {
    super(message);
    // Subclasses of Error inherit name "Error" unless it is set explicitly;
    // naming it makes logs and stack traces show that the failure was retryable.
    this.name = 'RetryableLlmError';
  }
}

/**
 * Run a reflection request, resampling when an attempt fails retryably.
 *
 * Up to MAX_ATTEMPTS single calls are made. A RetryableLlmError triggers a
 * logged backoff followed by another attempt; any other error is rethrown
 * immediately. All attempts share one start timestamp, which is handed to
 * each single call.
 *
 * @param cfg - Endpoint, model and accounting settings for the call.
 * @param systemPrompt - Content of the system message.
 * @param userPrompt - Content of the user message.
 * @returns The result of the first attempt that succeeds.
 * @throws The last RetryableLlmError once attempts are exhausted, or the
 *   first non-retryable error encountered.
 */
export async function callReflectionLlm(
  cfg: ReflectionLlmConfig,
  systemPrompt: string,
  userPrompt: string
): Promise<ReflectionLlmResult> {
  const startedAt = Date.now();
  let mostRecentFailure: Error | undefined;
  let attempt = 0;

  while (attempt < MAX_ATTEMPTS) {
    attempt += 1;
    try {
      return await callOnce(cfg, systemPrompt, userPrompt, startedAt);
    } catch (err) {
      // Deterministic failures are not worth another sample.
      if (!(err instanceof RetryableLlmError)) {
        throw err;
      }
      mostRecentFailure = err;

      // No backoff after the final attempt; fall through to the throw below.
      if (attempt >= MAX_ATTEMPTS) {
        break;
      }

      const backoffMs = RETRY_DELAYS_MS[attempt - 1] ?? 1500;
      const reason = err.message.slice(0, 200);
      logger.warn(
        `[reflection-llm] attempt ${attempt}/${MAX_ATTEMPTS} failed (${reason}); retrying in ${backoffMs}ms`,
      );
      await sleep(backoffMs);
    }
  }

  throw mostRecentFailure ?? new Error('reflection LLM failed');
}

/**
 * Turn a shared-client failure into either a retryable or a fail-fast error
 * for the reflection resample loop. Rules are applied in this order:
 *
 *  1. Gateway type budget_exhausted / rate_limited: fail fast (won't pass
 *     until the period resets).
 *  2. Gateway type gateway_shutdown / gateway_timeout: transient → resample.
 *  3. Message contains "blocked before send" (client-side prompt-size guard):
 *     fail fast, since resending the identical prompt cannot help.
 *  4. Message contains "HTTP <3 digits>": status >= 500 → resample (this
 *     includes tool-call parse errors on malformed model output); any lower
 *     status (bad key / request shape) → fail fast.
 *  5. Anything else (transport / parse / idle timeout): resample.
 *
 * @param message - Error text to classify; also used as the error message.
 * @param gatewayErrorType - Optional gateway-reported error category.
 * @returns A RetryableLlmError when a resample is worthwhile, otherwise a plain Error.
 */
function classifyClientError(message: string, gatewayErrorType?: string): Error {
  switch (gatewayErrorType) {
    case 'budget_exhausted':
    case 'rate_limited':
      return new Error(message);
    case 'gateway_shutdown':
    case 'gateway_timeout':
      return new RetryableLlmError(message);
    default:
      break;
  }

  // Preflight rejection: the prompt itself is too large, so a retry is futile.
  if (message.includes('blocked before send')) {
    return new Error(message);
  }

  const statusMatch = /HTTP (\d{3})/.exec(message);
  const status = statusMatch === null ? undefined : Number(statusMatch[1]);
  if (status !== undefined && status < 500) {
    return new Error(message);
  }
  // Either a 5xx response or no HTTP status at all — both are worth resampling.
  return new RetryableLlmError(message);
}

/**
 * Perform exactly one reflection request through the shared client and
 * collect its streamed events into a result.
 *
 * @param cfg - Endpoint, model and accounting settings for the call.
 * @param systemPrompt - Content of the system message.
 * @param userPrompt - Content of the user message.
 * @param start - Timestamp (ms) that `durationMs` is measured from.
 * @returns The parsed tool-call arguments with usage, duration and a debug payload.
 * @throws RetryableLlmError when no `submit_reflection` tool call arrives or
 *   its arguments lack the required fields; for a client error event, whatever
 *   classifyClientError returns (retryable or plain Error).
 */
async function callOnce(
  cfg: ReflectionLlmConfig,
  systemPrompt: string,
  userPrompt: string,
  start: number,
): Promise<ReflectionLlmResult> {
  // Going through the shared client keeps usage in the single per-user ledger
  // (gateway + direct), like every other LLM call. The client's own retry is
  // pinned to one attempt because the caller's loop owns resampling.
  const client = new OpenAICompatClient(
    cfg.endpoint,
    cfg.model,
    cfg.apiKey,
    { ...getDefaultProviderRetryConfig(), maxAttempts: 1 },
    undefined,
    cfg.contextLimitTokens, // real model window, so the 32k default does not block large reflection prompts
    undefined,
    undefined,
    { proxy: cfg.proxy === true },
  );
  const messages: Message[] = [
    { role: 'system', content: systemPrompt },
    { role: 'user', content: userPrompt },
  ];

  let reflection: ReflectionResult | null = null;
  let tokenUsage: { prompt_tokens: number; completion_tokens: number } | undefined;
  let failure: string | null = null;
  let failureGatewayType: string | undefined;
  let backend: string | undefined;

  // Force the model to answer via the submit_reflection tool.
  const stream = client.chat(
    messages,
    [REFLECTION_TOOL_SCHEMA as unknown as ToolDef],
    undefined,
    { userId: cfg.userId },
    { temperature: 0.2, toolChoice: { type: 'function', function: { name: 'submit_reflection' } } },
  ) as AsyncGenerator<LLMEvent>;

  for await (const evt of stream) {
    switch (evt.type) {
      case 'tool_use':
        // Only the first submit_reflection call is kept; later ones are ignored.
        if (evt.name === 'submit_reflection' && reflection === null) {
          reflection = evt.input as unknown as ReflectionResult;
        }
        break;
      case 'done':
        tokenUsage = evt.usage;
        break;
      case 'backend':
        backend = evt.backendId;
        break;
      case 'error':
        // The stream keeps being drained; the last error event seen wins.
        failure = evt.error;
        failureGatewayType = evt.gatewayErrorType;
        break;
      default:
        break;
    }
  }

  // An error event takes precedence even if a tool call was also received.
  if (failure !== null) {
    throw classifyClientError(`reflection LLM ${failure}`, failureGatewayType);
  }
  if (reflection === null) {
    throw new RetryableLlmError('reflection LLM returned no submit_reflection tool_call');
  }

  // The shared client swallows tool-argument JSON parse errors and yields an
  // empty `{}` input, so a missing required field is treated as broken tool
  // markup and resampled. This is a shallow presence check, not validation.
  const fields = reflection as unknown as Record<string, unknown>;
  const requiredKeys = ['piece_changes', 'reasoning'];
  if (requiredKeys.some((key) => fields[key] === undefined)) {
    throw new RetryableLlmError('reflection LLM tool_call arguments were malformed or incomplete');
  }

  return {
    parsed: reflection,
    tokensIn: tokenUsage?.prompt_tokens ?? 0,
    tokensOut: tokenUsage?.completion_tokens ?? 0,
    durationMs: Date.now() - start,
    // Rebuild an OpenAI-style response so `raw` keeps debugging fidelity with
    // the streaming client (issue #500): the resolved tool_call, usage, and
    // (proxy) backend id rather than just `{ usage }`.
    raw: {
      usage: tokenUsage,
      backendId: backend,
      choices: [
        {
          message: {
            tool_calls: [
              { function: { name: 'submit_reflection', arguments: JSON.stringify(reflection) } },
            ],
          },
        },
      ],
    },
  };
}