import type {
  Message,
  ToolDef,
  ToolCall,
  OpenAICompatClient,
  LLMEvent,
} from '../llm/openai-compat.js';
import { logger } from '../logger.js';
import { stripThinkingTokens } from './strip-thinking.js';

/** Builds the error message thrown when an isolated (tool-less) call asks for a tool. */
const ISOLATED_TOOL_USE_ERROR = (name: string) =>
  `Isolated LLM call unexpectedly requested tool "${name}"`;

/** Upper bound on how long we wait for the stream's `return()` during cleanup. */
const STREAM_RETURN_TIMEOUT_MS = 5_000;

/**
 * Run an isolated text-only LLM call: no tools, no callbacks, no state.
 * Used by the prompt-guard summarization stage and by buildContextOverflowResult
 * to produce a last-resort handoff summary.
 *
 * @param client - Client whose `chat()` stream is consumed.
 * @param messages - Conversation to send; passed through untouched.
 * @param cancelSignal - Optional abort signal forwarded to `client.chat`.
 * @returns The concatenated text events with thinking-token blocks stripped.
 * @throws If the LLM tries to invoke a tool or returns an error event.
 */
export async function runIsolatedLlm(
  client: OpenAICompatClient,
  messages: Message[],
  cancelSignal?: AbortSignal,
): Promise<string> {
  let output = '';
  for await (const event of client.chat(messages, undefined, cancelSignal)) {
    if (event.type === 'text') {
      output += event.text;
      continue;
    }
    if (event.type === 'tool_use') {
      throw new Error(ISOLATED_TOOL_USE_ERROR(event.name));
    }
    if (event.type === 'error') {
      throw new Error(event.error);
    }
    // Every other event type is irrelevant to a text-only call and is skipped.
  }
  return stripThinkingTokens(output);
}

/** Prompt-processing progress forwarded verbatim from a `prompt_progress` event. */
export interface PromptProgress {
  processed: number;
  total: number;
  timeMs: number;
  cache: number;
}

/** Optional observers invoked while a stream is being consumed. */
export interface ConsumeStreamCallbacks {
  /** Fired for every text chunk, before thinking tokens are stripped. */
  onText?: (text: string) => void;
  /** Fired once per aggregated tool call. */
  onToolUse?: (name: string, input: Record<string, unknown>, callId: string) => void;
  /**
   * Fired per streaming tool-call argument chunk (before the aggregated
   * onToolUse). Used to render live tool content. Does NOT affect
   * pendingToolCalls — the final tool_use still builds those.
   */
  onToolCallDelta?: (index: number, callId: string, name: string, chunk: string) => void;
  onPromptProgress?: (progress: PromptProgress) => void;
  /**
   * Fired at most once per LLM call when the OpenAICompatClient is in proxy
   * mode and the response surfaced a backend identity header
   * (e.g. `x-litellm-model-id`). The runner uses this to attribute the
   * call to a specific physical backend behind the proxy, so the UI
   * can render the matching Pet / NodeStatus.
   *
   * cacheKey is non-null only on LiteLLM cache hits (`x-litellm-cache-key`).
   */
  onBackend?: (backendId: string, cacheKey: string | null) => void;
}

/** Everything collected from one fully consumed LLM response stream. */
export interface ConsumedLLMResponse {
  /** Concatenated text chunks; thinking tokens are stripped before return. */
  accumulatedText: string;
  /** Tool calls requested by the model, in arrival order. */
  pendingToolCalls: ToolCall[];
  /** True when an `error` event arrived or consumption itself failed. */
  hadError: boolean;
  /** Message of the most recent error; empty string when `hadError` is false. */
  errorMessage: string;
  /** Usage reported by the `done` event, when it carried any. */
  lastUsage?: { prompt_tokens: number; completion_tokens: number };
  /**
   * The physical backend id that handled this call, set when the
   * client is proxy-mode and the proxy reported one. Left unset
   * (undefined) for direct (non-proxy) workers, or proxy responses
   * missing the header.
   */
  backendId?: string;
  /** LiteLLM cache key when this response was a cache hit; null otherwise. */
  backendCacheKey?: string | null;
}

/**
 * Await `pending`, rejecting if it has not settled within `idleTimeoutMs`.
 * The timer is always cleared once the race settles, so no timer outlives
 * the call.
 */
async function nextWithIdleTimeout<T>(pending: Promise<T>, idleTimeoutMs: number): Promise<T> {
  let idleTimer: ReturnType<typeof setTimeout> | undefined;
  const idleTimeout = new Promise<never>((_, reject) => {
    idleTimer = setTimeout(
      () => reject(new Error('LLM stream idle safety timeout')),
      idleTimeoutMs,
    );
  });
  try {
    // Promise.race subscribes to `pending`, so a late rejection after a
    // timeout is still handled and never surfaces as an unhandled rejection.
    return await Promise.race([pending, idleTimeout]);
  } finally {
    clearTimeout(idleTimer);
  }
}

/**
 * Consume one LLM response stream end-to-end with an idle-timeout safety net.
 *
 * - Resets the per-event timeout on every chunk so a long-running but actively
 *   streaming response is allowed; only true silence past `idleTimeoutMs`
 *   trips the abort.
 * - On timeout or stream error, ensures the underlying generator is returned
 *   (with a 5s safety cap on `return()` itself, since some generators hang).
 * - Strips thinking-token blocks (DeepSeek/Qwen/Gemma flavors) from the
 *   accumulated text before returning.
 *
 * Pure I/O — no movement state. Caller is responsible for translating the
 * returned tool calls into actions and feeding `onToolUse`/`onText` events
 * to its callback bridge.
 *
 * @param client - Client whose `chat()` stream is consumed.
 * @param messages - Conversation to send.
 * @param tools - Tool definitions offered to the model.
 * @param cancelSignal - Optional abort signal forwarded to `client.chat`.
 * @param idleTimeoutMs - Maximum silence between two stream events.
 * @param callbacks - Optional observers; see {@link ConsumeStreamCallbacks}.
 * @param contextLabel - Prefix inserted verbatim into log lines; include any
 *   trailing separator in the label itself.
 * @returns The accumulated response. Failures while iterating the stream are
 *   reported through `hadError` / `errorMessage` instead of being thrown. An
 *   `error` event is recorded but does not by itself stop consumption.
 */
export async function consumeLlmStream(
  client: OpenAICompatClient,
  messages: Message[],
  tools: ToolDef[],
  cancelSignal: AbortSignal | undefined,
  idleTimeoutMs: number,
  callbacks: ConsumeStreamCallbacks = {},
  contextLabel: string = '',
): Promise<ConsumedLLMResponse> {
  const stream = client.chat(messages, tools, cancelSignal);
  const accumulator: ConsumedLLMResponse = {
    accumulatedText: '',
    pendingToolCalls: [],
    hadError: false,
    errorMessage: '',
  };

  try {
    for (;;) {
      const result = await nextWithIdleTimeout(stream.next(), idleTimeoutMs);
      if (result.done) break;
      handleEvent(result.value, accumulator, callbacks, contextLabel);
    }
  } catch (safetyErr) {
    const msg = safetyErr instanceof Error ? safetyErr.message : String(safetyErr);
    logger.error(`[llm-stream] ${contextLabel}stream safety timeout or error: ${msg}`);
    accumulator.hadError = true;
    accumulator.errorMessage = msg;

    // Best-effort release of the generator. `return()` may queue behind a
    // still-pending `next()`, so cap the wait, and clear the cap timer
    // afterwards so it cannot keep the event loop alive.
    let cleanupTimer: ReturnType<typeof setTimeout> | undefined;
    try {
      await Promise.race([
        stream.return(undefined as never),
        new Promise<void>((resolve) => {
          cleanupTimer = setTimeout(resolve, STREAM_RETURN_TIMEOUT_MS);
        }),
      ]);
    } catch {
      /* swallow — best-effort cleanup */
    } finally {
      clearTimeout(cleanupTimer);
    }
  }

  accumulator.accumulatedText = stripThinkingTokens(accumulator.accumulatedText);
  return accumulator;
}

/**
 * Fold a single stream event into the accumulator and notify the matching
 * callback. Event types without a case below are ignored.
 */
function handleEvent(
  event: LLMEvent,
  acc: ConsumedLLMResponse,
  callbacks: ConsumeStreamCallbacks,
  contextLabel: string,
): void {
  switch (event.type) {
    case 'text':
      acc.accumulatedText += event.text;
      callbacks.onText?.(event.text);
      return;

    case 'tool_use': {
      // Serialize once; the same string feeds the tool call and the log line.
      const serializedArgs = JSON.stringify(event.input);
      acc.pendingToolCalls.push({
        id: event.id,
        type: 'function',
        function: {
          name: event.name,
          arguments: serializedArgs,
        },
      });
      callbacks.onToolUse?.(event.name, event.input, event.id);
      // Arguments are truncated to 300 characters to keep log lines bounded.
      logger.info(
        `[llm-stream] ${contextLabel}tool_use: ${event.name} args=${serializedArgs.substring(0, 300)}`,
      );
      return;
    }

    case 'tool_use_delta':
      callbacks.onToolCallDelta?.(event.index, event.callId, event.name, event.chunk);
      return;

    case 'done':
      if (event.usage) acc.lastUsage = event.usage;
      return;

    case 'error':
      acc.hadError = true;
      acc.errorMessage = event.error;
      logger.error(`[llm-stream] ${contextLabel}LLM error: ${event.error}`);
      return;

    case 'backend':
      acc.backendId = event.backendId;
      acc.backendCacheKey = event.cacheKey;
      callbacks.onBackend?.(event.backendId, event.cacheKey);
      logger.info(
        `[llm-stream] ${contextLabel}proxy backend resolved: id=${event.backendId} cache=${event.cacheKey ?? 'miss'}`,
      );
      return;

    case 'prompt_progress':
      callbacks.onPromptProgress?.({
        processed: event.processed,
        total: event.total,
        timeMs: event.timeMs,
        cache: event.cache,
      });
      return;
  }
}