import { logger } from '../../logger.js';
import { getDefaultProviderRetryConfig } from '../../config.js';
import { OpenAICompatClient, type LLMEvent, type Message, type ToolDef } from '../../llm/openai-compat.js';
import type { ReflectionResult } from './types.js';
import { REFLECTION_TOOL_SCHEMA } from './reflection-schema.js';

/** Connection and routing settings for a single reflection LLM call. */
export interface ReflectionLlmConfig {
  endpoint: string;
  model: string | undefined;
  apiKey?: string;
  /** True when the reflection worker routes through the AAO Gateway (proxy). */
  proxy?: boolean;
  /** Reflection target user — recorded as the usage owner. */
  userId?: string;
  /**
   * Model context window in tokens. Passed to the shared client's
   * prompt-size preflight guard. Reflection prompts can be large (uncapped
   * memory snapshot), so use the worker's real limit rather than the
   * client's conservative 32k default, which would block valid prompts.
   */
  contextLimitTokens?: number;
}

/** Outcome of a successful reflection call. */
export interface ReflectionLlmResult {
  /** Arguments of the `submit_reflection` tool call emitted by the model. */
  parsed: ReflectionResult;
  /** Prompt tokens reported by the client (0 when no usage was reported). */
  tokensIn: number;
  /** Completion tokens reported by the client (0 when no usage was reported). */
  tokensOut: number;
  /** Wall-clock time since the first attempt started, including retry backoff. */
  durationMs: number;
  /** Reconstructed OpenAI-style response, kept for debugging. */
  raw: unknown;
}

/** Total attempts (1 initial + retries) for resample-worthy failures. */
const MAX_ATTEMPTS = 3;

/** Backoff before attempt 2 and 3. The sleeper that applies them is injectable for tests. */
const RETRY_DELAYS_MS: readonly number[] = [500, 1500];

let sleep: (ms: number) => Promise<void> = (ms) =>
  new Promise<void>((resolve) => setTimeout(resolve, ms));

/** Test hook: replace the backoff sleeper (avoids real timers in vitest). */
export function setReflectionRetrySleep(fn: (ms: number) => Promise<void>): void {
  sleep = fn;
}

/**
 * Errors worth a resample: small reflection models occasionally emit
 * malformed tool-call markup, which strict backends (e.g. llama-server's
 * tool parser) reject with a 5xx like
 * "Failed to parse input at pos 41: ...". The sampling is
 * stochastic (temperature 0.2), so simply asking again usually succeeds.
 * 4xx (bad key, bad request shape) is deterministic config error — fail fast.
 */
class RetryableLlmError extends Error {}

/**
 * Run the reflection prompt against the configured LLM, resampling on
 * transient failures.
 *
 * Makes up to `MAX_ATTEMPTS` calls. Only `RetryableLlmError` triggers another
 * attempt (after the matching `RETRY_DELAYS_MS` backoff); any other error is
 * rethrown immediately.
 *
 * @param cfg - Endpoint, model and routing settings for the call.
 * @param systemPrompt - Content of the system message.
 * @param userPrompt - Content of the user message.
 * @returns The parsed tool-call arguments plus usage and timing details.
 * @throws The first non-retryable error, or the last retryable error once all
 *   attempts are exhausted.
 */
export async function callReflectionLlm(
  cfg: ReflectionLlmConfig,
  systemPrompt: string,
  userPrompt: string
): Promise<ReflectionLlmResult> {
  // One shared start time so durationMs covers every attempt and backoff.
  const start = Date.now();
  let lastErr: Error | null = null;
  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    try {
      return await callOnce(cfg, systemPrompt, userPrompt, start);
    } catch (e) {
      // Deterministic failures (and anything unexpected) are not resampled.
      if (!(e instanceof RetryableLlmError)) throw e;
      lastErr = e;
      if (attempt < MAX_ATTEMPTS) {
        const delay = RETRY_DELAYS_MS[attempt - 1] ?? 1500;
        logger.warn(
          `[reflection-llm] attempt ${attempt}/${MAX_ATTEMPTS} failed (${e.message.slice(0, 200)}); retrying in ${delay}ms`,
        );
        await sleep(delay);
      }
    }
  }
  throw lastErr ?? new Error('reflection LLM failed');
}

/**
 * Classify an OpenAICompatClient error for the reflection resample loop.
 * - HTTP 5xx (incl. tool-call parse errors on malformed model output) and
 *   gateway_shutdown / gateway_timeout: transient → resample.
 * - HTTP 4xx (bad key / request shape), budget_exhausted / rate_limited
 *   (won't pass until the period resets), and the client-side
 *   "blocked before send" prompt-size guard: deterministic → fail fast.
 * - Everything else (transport / parse / idle timeout): stochastic → resample.
 *
 * @param message - Error text to carry on the returned error.
 * @param gatewayErrorType - Gateway error tag from the client's error event, if any.
 * @returns A `RetryableLlmError` for resample-worthy failures, otherwise a plain `Error`.
 */
function classifyClientError(message: string, gatewayErrorType?: string): Error {
  if (gatewayErrorType === 'budget_exhausted' || gatewayErrorType === 'rate_limited') {
    return new Error(message);
  }
  if (gatewayErrorType === 'gateway_shutdown' || gatewayErrorType === 'gateway_timeout') {
    return new RetryableLlmError(message);
  }
  // Client-side preflight rejection — the prompt is too large; resampling the
  // identical prompt cannot help.
  if (message.includes('blocked before send')) {
    return new Error(message);
  }
  const m = /HTTP (\d{3})/.exec(message);
  if (m) {
    const status = Number(m[1]);
    if (status >= 500) return new RetryableLlmError(message);
    return new Error(message);
  }
  return new RetryableLlmError(message);
}

/**
 * Perform a single reflection request and turn the event stream into a result.
 *
 * @param cfg - Endpoint, model and routing settings for the call.
 * @param systemPrompt - Content of the system message.
 * @param userPrompt - Content of the user message.
 * @param start - `Date.now()` timestamp of the first attempt, used for `durationMs`.
 * @returns The parsed tool-call arguments plus usage and timing details.
 * @throws `RetryableLlmError` when the failure is worth resampling, a plain
 *   `Error` when it is deterministic (see `classifyClientError`).
 */
async function callOnce(
  cfg: ReflectionLlmConfig,
  systemPrompt: string,
  userPrompt: string,
  start: number,
): Promise<ReflectionLlmResult> {
  // Route through the shared client so usage lands in the single
  // per-user ledger (gateway + direct) like every other LLM call.
  // maxAttempts=1: the outer callReflectionLlm loop owns resampling.
  const client = new OpenAICompatClient(
    cfg.endpoint,
    cfg.model,
    cfg.apiKey,
    { ...getDefaultProviderRetryConfig(), maxAttempts: 1 },
    undefined,
    cfg.contextLimitTokens, // real model window; avoid the 32k default blocking large reflection prompts
    undefined,
    undefined,
    { proxy: cfg.proxy === true },
  );

  const messages: Message[] = [
    { role: 'system', content: systemPrompt },
    { role: 'user', content: userPrompt },
  ];

  let parsed: ReflectionResult | null = null;
  let usage: { prompt_tokens: number; completion_tokens: number } | undefined;
  let errorMsg: string | null = null;
  let errorGatewayType: string | undefined;
  let backendId: string | undefined;

  for await (const event of client.chat(
    messages,
    [REFLECTION_TOOL_SCHEMA as unknown as ToolDef],
    undefined,
    { userId: cfg.userId },
    { temperature: 0.2, toolChoice: { type: 'function', function: { name: 'submit_reflection' } } },
  ) as AsyncGenerator<LLMEvent>) {
    if (event.type === 'tool_use') {
      // Only the first submit_reflection call is kept; later ones are ignored.
      if (event.name === 'submit_reflection' && parsed === null) {
        parsed = event.input as unknown as ReflectionResult;
      }
    } else if (event.type === 'done') {
      usage = event.usage;
    } else if (event.type === 'backend') {
      backendId = event.backendId;
    } else if (event.type === 'error') {
      errorMsg = event.error;
      errorGatewayType = event.gatewayErrorType;
    }
  }

  // A reported error wins over any tool call collected from the same stream.
  if (errorMsg !== null) {
    throw classifyClientError(`reflection LLM ${errorMsg}`, errorGatewayType);
  }
  if (parsed === null) {
    throw new RetryableLlmError('reflection LLM returned no submit_reflection tool_call');
  }

  // The shared client swallows tool-argument JSON parse errors and yields an
  // empty `{}` input. Preserve the old resample-on-malformed behaviour with a
  // shallow structural check against the tool schema's required fields — a
  // genuinely-empty object means the model emitted broken tool markup.
  const p = parsed as unknown as Record<string, unknown>;
  if (p['piece_changes'] === undefined || p['reasoning'] === undefined) {
    throw new RetryableLlmError('reflection LLM tool_call arguments were malformed or incomplete');
  }

  return {
    parsed,
    tokensIn: usage?.prompt_tokens ?? 0,
    tokensOut: usage?.completion_tokens ?? 0,
    durationMs: Date.now() - start,
    // Reconstruct the OpenAI response shape so `raw` keeps debugging fidelity
    // after the move to the streaming client (issue #500): the resolved
    // tool_call, usage, and (proxy) backend id rather than just `{ usage }`.
    raw: {
      usage,
      backendId,
      choices: [
        { message: { tool_calls: [{ function: { name: 'submit_reflection', arguments: JSON.stringify(parsed) } }] } },
      ],
    },
  };
}