maestro/src/engine/tools/browser.ts
2026-06-03 05:08:00 +00:00

1831 lines
75 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { ToolDef } from '../../llm/openai-compat.js';
import type { ToolContext, ToolResult } from './core.js';
import { resolveOutputPathWithin } from './core.js';
import { logger } from '../../logger.js';
import { recorder } from '../browser-recorder.js';
import type { RecordedAction, FrameChainEntry } from '../browser-recorder.js';
import * as dns from 'dns';
import * as path from 'path';
import { existsSync, mkdirSync, appendFileSync, statSync } from 'fs';
import { fileURLToPath, pathToFileURL } from 'url';
import type { Browser, BrowserContext, Download, Frame, Locator, Page } from 'playwright';
import { SessionManager, type BrowserSession, CAPTCHA_POOL_SESSION_ID } from '../browser-session.js';
import { loadConfig } from '../../config.js';
import { buildNovncPath } from '../../bridge/novnc-proxy.js';
import { checkSSRF, isPrivateIPv4, isPrivateIPv6, isHostAllowed } from './shared/ssrf.js';
import { htmlToText } from './shared/html.js';
import { detectAuthExpiry } from '../browser-session-expiry.js';
export { detectAuthExpiry as runAuthCheck } from '../browser-session-expiry.js';
/**
 * Post-navigation heuristic for an expired login session.
 *
 * Returns null immediately unless the job has both a browser session profile
 * id and the profile itself bound on `ctx`. Otherwise it probes the profile's
 * `loggedInSelector` (when one is configured), passes the page's current URL
 * and the probe result to `detectAuthExpiry`, and — if the verdict says the
 * session expired — notifies `ctx.onAuthExpired` and returns the reason.
 *
 * @param page - Page that has just been navigated.
 * @param ctx - Tool context carrying the optional session profile.
 * @returns The expiry reason, or null when no expiry was detected.
 */
async function checkAuthExpiry(page: Page, ctx: ToolContext): Promise<string | null> {
  const profileId = ctx.browserSessionProfileId;
  const profile = ctx.browserSessionProfile;
  if (!profileId || !profile) return null;

  // Without a configured selector the "logged in" marker counts as present.
  let loggedInSelectorPresent = true;
  if (profile.loggedInSelector) {
    const handle = await page.$(profile.loggedInSelector).catch(() => null);
    loggedInSelectorPresent = !!handle;
  }

  // The real status code is hard to obtain reliably after waitFor, so the
  // heuristic relies on URL pattern + selector. 200 is a placeholder that
  // keeps detectAuthExpiry from flagging a 401.
  const verdict = detectAuthExpiry({
    profile,
    finalUrl: page.url(),
    statusCode: 200,
    loggedInSelectorPresent,
  });
  if (!verdict.expired) return null;

  ctx.onAuthExpired?.(profileId, verdict.reason);
  return verdict.reason;
}
/**
 * Returns true when `child` is `parent` itself or lies somewhere beneath it.
 *
 * Both arguments are resolved to absolute paths before comparison, so relative
 * inputs and `..` segments are normalised first. Only a leading `..` *segment*
 * in the relative path counts as escaping the parent; a child whose first
 * segment merely begins with two dots (e.g. `..cache/a.html`) is still inside.
 *
 * @param parent - Directory that must contain `child`.
 * @param child - Path to test.
 * @returns Whether `child` is equal to or nested inside `parent`.
 */
function isPathWithin(parent: string, child: string): boolean {
  const relative = path.relative(path.resolve(parent), path.resolve(child));
  if (relative === '') return true;
  // path.relative yields an absolute path when no relative route exists
  // (e.g. a different drive on Windows).
  if (path.isAbsolute(relative)) return false;
  return relative !== '..' && !relative.startsWith('..' + path.sep);
}
/**
 * Maps a parsed `file:` URL onto a path inside the workspace.
 *
 * Paths under the legacy virtual root `/workspace` that are not already inside
 * the real workspace are remapped onto the real workspace root (with a
 * deprecation warning). Anything that still falls outside the workspace is
 * rejected.
 *
 * @param parsed - Parsed `file:` URL.
 * @param workspacePath - Workspace root directory.
 * @returns The resolved file path plus a normalised `file://` URL that keeps
 *   the original query string and fragment, or `{ error }` on rejection.
 */
function resolveWorkspaceFileUrl(parsed: URL, workspacePath: string): { filePath: string; url: string } | { error: string } {
  let filePath: string;
  try {
    filePath = fileURLToPath(parsed);
  } catch (e) {
    return { error: `Invalid file URL: ${(e as Error).message}` };
  }

  const workspaceRoot = path.resolve(workspacePath);
  // Backwards-compat: `file:///workspace/...` used to be documented as a
  // virtual workspace root. In-flight jobs and LLMs trained on the old
  // convention may still emit it, so it is remapped and logged rather than
  // rejected outright.
  const legacyRoot = path.resolve('/workspace');
  const alreadyInside = isPathWithin(workspaceRoot, filePath);
  if (!alreadyInside && isPathWithin(legacyRoot, filePath)) {
    logger.warn(`[BrowseWeb] deprecated /workspace virtual path used: ${filePath}. Use a workspace-relative path instead (e.g., "output/foo.html").`);
    const tail = path.relative(legacyRoot, filePath);
    filePath = path.resolve(workspaceRoot, tail);
  }

  if (!isPathWithin(workspacePath, filePath)) {
    return { error: `file:// URL is only allowed within workspace: ${workspacePath}` };
  }

  const normalized = new URL(pathToFileURL(filePath).href);
  normalized.search = parsed.search;
  normalized.hash = parsed.hash;
  return { filePath, url: normalized.href };
}
/**
 * Checks whether a `file:` URL may be opened for the given workspace.
 *
 * @returns The rejection message from `resolveWorkspaceFileUrl`, or null when
 *   the URL is allowed.
 */
function validateFileUrlAccess(parsed: URL, workspacePath: string): string | null {
  const outcome = resolveWorkspaceFileUrl(parsed, workspacePath);
  if ('error' in outcome) return outcome.error;
  return null;
}
/**
 * Heuristic: does `url` look like a workspace-relative path rather than a URL?
 *
 * Scheme-less inputs are accepted so the LLM can pass `output/viewer.html`
 * directly (the recommended form). Protocol-relative inputs (`//host/...`) and
 * anything that starts with a URL scheme (`http:`, `file:`, `about:`, `data:`,
 * ...) are left to the URL parser.
 *
 * To avoid swallowing genuinely malformed URLs such as "example.com" or
 * "not-a-url", the input must contain a `/` (which includes the `./` and `../`
 * prefixes). A bare token without a slash is left to the URL parser, which
 * rejects it as "Invalid URL".
 */
function looksLikeWorkspaceRelativePath(url: string): boolean {
  if (!url || url.startsWith('//')) return false;

  const hasScheme = /^[a-zA-Z][a-zA-Z0-9+\-.]*:/.test(url);
  if (hasScheme) return false;

  const hasDotPrefix = url.startsWith('./') || url.startsWith('../');
  return hasDotPrefix || url.includes('/');
}
/**
 * Normalises a BrowseWeb target so local files can only be opened from inside
 * the workspace.
 *
 * - Workspace-relative paths (e.g. "output/viewer.html") are resolved against
 *   `workspacePath` and returned as a `file://` URL; absolute paths and paths
 *   escaping the workspace are rejected.
 * - `file:` URLs are validated and normalised through `resolveWorkspaceFileUrl`.
 * - Every other parseable URL is returned unchanged.
 *
 * @param url - Raw target supplied by the caller.
 * @param workspacePath - Workspace root directory.
 * @returns `{ url }` on success, `{ error }` with a message otherwise.
 */
export function normalizeFileUrlForWorkspace(url: string, workspacePath: string): { url: string } | { error: string } {
  if (!looksLikeWorkspaceRelativePath(url)) {
    let parsed: URL;
    try {
      parsed = new URL(url);
    } catch {
      return { error: `Invalid URL: "${url}"` };
    }
    if (parsed.protocol !== 'file:') return { url };

    const fileResult = resolveWorkspaceFileUrl(parsed, workspacePath);
    return 'error' in fileResult ? { error: fileResult.error } : { url: fileResult.url };
  }

  // Workspace-relative branch: a leading "/" makes the input an absolute path,
  // which is never accepted in this form.
  if (path.isAbsolute(url)) {
    return { error: `BrowseWeb URL "${url}" は workspace 外の絶対パスです。workspace ルートからの相対パス (例: "output/foo.html") または完全な URL (https://...) を使ってください。` };
  }
  const absolute = path.resolve(workspacePath, url);
  return isPathWithin(workspacePath, absolute)
    ? { url: pathToFileURL(absolute).href }
    : { error: `Path "${url}" is outside workspace: ${workspacePath}` };
}
/**
 * Decides whether `url` may be requested.
 *
 * `file:` URLs are validated against the workspace; every other URL has its
 * hostname passed to `checkSSRF` together with `allowedHosts`.
 *
 * @returns An error message when the URL is unparseable or blocked, or null
 *   when it is allowed.
 */
async function ssrfCheck(url: string, allowedHosts: string[], workspacePath: string): Promise<string | null> {
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    return `Invalid URL: "${url}"`;
  }

  const isLocalFile = target.protocol === 'file:';
  if (isLocalFile) return validateFileUrlAccess(target, workspacePath);

  try {
    await checkSSRF(target.hostname, allowedHosts);
    return null;
  } catch (e) {
    return (e as Error).message;
  }
}
// --- Tool definitions ---
/**
 * One entry of the BrowseWeb `actions` array. `type` selects the action; the
 * remaining fields are optional and only meaningful for some action types
 * (see the per-field notes, which mirror the tool schema descriptions).
 */
interface BrowseWebAction {
type: 'goto' | 'click' | 'fill' | 'screenshot' | 'getText' | 'wait' | 'dumpHtml';
/** CSS selector (click, fill, getText, dumpHtml); not needed when `ref` is given. */
selector?: string;
/** Element ref from a previous snapshot (e1, e2, ...), usable instead of `selector`. */
ref?: string;
/** Input value (fill) or file name (screenshot). */
value?: string;
/** Target URL or workspace-relative path (goto). */
url?: string;
/** Wait duration in milliseconds (wait). */
ms?: number;
/** dumpHtml: number of descendant levels to include. Default 3. */
depth?: number;
/** dumpHtml: length of the preview in the return value (the full HTML is saved to a file). Default 5000. */
maxChars?: number;
}
/**
 * Function-calling schema for the BrowseWeb tool.
 *
 * The description strings are shown to the model verbatim, so they are kept in
 * their original language. Four of them (url, screenshot, timeout, recordTo)
 * had missing or unbalanced parentheses, which ran separate sentences
 * together; those are repaired here.
 */
const BROWSEWEB_DEF: ToolDef = {
  type: 'function',
  function: {
    name: 'BrowseWeb',
    description:
      'ヘッドレスブラウザでWebページを操作する。同一ジョブ内ではセッションCookie・ログイン状態等が維持される。\n' +
      '基本モード: url を指定してページのテキストを取得。screenshot でスクリーンショットも保存可能。\n' +
      'アクションモード: actions 配列で goto/click/fill/screenshot/getText/wait/dumpHtml を連続実行。\n' +
      '出力には操作可能要素が {e1 button "..."} 形式の ref 注釈付きで埋め込まれ、click/fill で ref を直接指定できる。<div role="button"> 等の ARIA ベース要素・addEventListener で click handler が後付けされた要素・open shadow DOM・iframe (cross-origin 含む) の中身も検出される。iframe 内の ref は {f1.e3 ...} のように frame ID で prefix される。状態属性 (expanded/checked/selected/pressed/disabled/haspopup) は注釈末尾に列挙。\n' +
      'ref で当たらない or 構造を直接見たいときは dumpHtml アクションで該当要素の outerHTML を取得できる（脱出口）。\n' +
      'click が繰り返し空振り / ログイン or CAPTCHA / ドラッグ&ドロップや canvas など DOM では操作できない UI に当たったら、InteractiveBrowse でユーザーに noVNC 経由で手動操作してもらい、その後 BrowseWithSession で続きを引き継げる。\n' +
      'ページから発生したファイルダウンロードは自動的に output/ に保存され、戻り値末尾に [download] saved output/<filename> として通知される。\n' +
      '詳細な使い方・ワークフロー例は ReadToolDoc({ name: "BrowseWeb" }) で取得可能。',
    parameters: {
      type: 'object',
      properties: {
        url: { type: 'string', description: '取得する URL (アクションモード時は省略可)。ローカルファイルを開く場合は workspace ルートからの相対パスを指定 (例: "output/viewer.html")' },
        waitFor: {
          type: 'string',
          description: '待機する CSS セレクタ(省略時は load イベント完了まで待機)',
        },
        extractSelector: {
          type: 'string',
          description: '特定要素のテキストだけ抽出する CSS セレクタ（省略時はページ全体）',
        },
        screenshot: {
          type: 'string',
          description: 'スクリーンショットを保存するファイル名（例: "page.png"）。output/ に保存される',
        },
        actions: {
          type: 'array',
          description: '実行するアクションの配列（指定時は基本モードのパラメータは無視される）',
          items: {
            type: 'object',
            properties: {
              type: {
                type: 'string',
                enum: ['goto', 'click', 'fill', 'screenshot', 'getText', 'wait', 'dumpHtml'],
                description: 'アクション種別',
              },
              selector: { type: 'string', description: 'CSS セレクタ (click, fill, getText, dumpHtml) — ref があれば不要' },
              ref: { type: 'string', description: '前回スナップショットで割り振られた要素 ref (e1, e2, ...) — click/fill/dumpHtml で selector の代わりに使える' },
              value: { type: 'string', description: '入力値 (fill) またはファイル名 (screenshot)' },
              url: { type: 'string', description: 'URL (goto)。ローカルファイルを開く場合は workspace ルートからの相対パス (例: "output/viewer.html")' },
              ms: { type: 'number', description: '待機ミリ秒 (wait)' },
              depth: { type: 'number', description: 'dumpHtml: 包含する子孫の階層数 (デフォルト 3)' },
              maxChars: { type: 'number', description: 'dumpHtml: 戻り値プレビュー長 (デフォルト 5000)。フル HTML は logs/browse/ に保存' },
            },
            required: ['type'],
          },
        },
        timeout: {
          type: 'number',
          description: 'タイムアウト(ミリ秒、デフォルト: 60000)',
        },
        recordTo: {
          type: 'string',
          description:
            '省略すると記録しない。指定すると、本ジョブで成功した各アクションを buffer に記録し、タスク終了時に data/users/{userId}/recordings/{recordTo}.json として保存する (Task 3.5 の Save as Script で使う)。',
        },
      },
    },
  },
};
// --- Session manager (injected from server.ts) ---

/** Module-level SessionManager handle; null until one is injected. */
let _sessionManager: SessionManager | null = null;

/**
 * Injects the SessionManager (called from server.ts). Passing null clears it.
 */
export function setSessionManager(sm: SessionManager | null): void {
  _sessionManager = sm;
}

/** Returns the injected SessionManager, or null when none has been set. */
export function getSessionManager(): SessionManager | null {
  return _sessionManager;
}
// --- Browser lifecycle ---
//
// 2026-05 redesign: what used to be a single shared Browser is now split into
// three kinds:
// 1. CAPTCHA Pool (kind='pool'): a shared noVNC session in which an admin
//    solves CAPTCHAs. Used by cross-task work such as WebSearch / WebFetch
//    screenshots. Fixed sessionId `__captcha_pool__`.
// 2. Task Session (kind='task'): a noVNC session isolated per task.
//    BrowseWeb / InteractiveBrowse obtain and reuse it keyed by ctx.taskId.
//    Access is authorised based on task visibility. Subject to LRU eviction
//    and idle GC.
// 3. Shared headless (skip mode): a single Browser used when the config's
//    captchaSolve != 'novnc', or on the fallback path when noVNC cannot be
//    started.
// Currently launched shared headless Browser, if any.
let _headlessBrowser: Browser | null = null;
// In-flight launch of the shared headless Browser; lets concurrent callers share one launch.
let _headlessInitPromise: Promise<Browser> | null = null;
// Set once a launch failed because the Playwright browser is unavailable; later calls fail fast with this message.
let _browserUnavailable: string | null = null;
/**
 * Returns the shared headless Browser (used in skip mode and as the noVNC
 * fallback), launching it on first use.
 *
 * Concurrent callers share the same in-flight launch. When a launch fails
 * with a message indicating the browser executable is missing, the message is
 * remembered so that subsequent calls throw immediately instead of retrying.
 *
 * @throws Error when the browser is known to be unavailable, or the launch
 *   error itself.
 */
async function getHeadlessBrowser(): Promise<Browser> {
  if (_browserUnavailable) throw new Error(_browserUnavailable);
  if (_headlessBrowser?.isConnected()) return _headlessBrowser;
  if (_headlessInitPromise) return _headlessInitPromise;

  const launch = async (): Promise<Browser> => {
    const { chromium } = await import('playwright');
    const { buildLaunchOptions } = await import('../browser-launch.js');
    const launched = await chromium.launch(buildLaunchOptions(loadConfig().browser, true));
    _headlessBrowser = launched;
    logger.debug('[browser] launched chromium headless');
    return launched;
  };

  const rememberUnavailable = (e: unknown): never => {
    const msg = (e as Error).message ?? String(e);
    const missingBrowser = msg.includes("Executable doesn't exist") || msg.includes('browserType.launch');
    if (missingBrowser) {
      _browserUnavailable = `Playwright browser unavailable: ${msg}`;
      logger.warn(`[browser] ${_browserUnavailable}`);
    }
    throw e;
  };

  _headlessInitPromise = launch()
    .catch(rememberUnavailable)
    .finally(() => {
      _headlessInitPromise = null;
    });
  return _headlessInitPromise;
}
/**
 * Returns the Browser backing the CAPTCHA pool.
 *
 * - noVNC mode: obtained through `SessionManager.createPoolSession()`.
 * - skip mode, missing SessionManager, or any failure / disconnected pool
 *   browser: falls back to the shared headless Browser.
 *
 * The pool serves cross-task work (WebSearch, WebFetch screenshots) that wants
 * to share the same cookies / authentication: once an admin solves a CAPTCHA
 * through noVNC, the cookies remain in the pool's context for later searches.
 */
export async function getCaptchaPoolBrowser(): Promise<Browser> {
  const novncEnabled = loadConfig().browser?.captchaSolve === 'novnc';
  if (!novncEnabled) return getHeadlessBrowser();

  const sm = getSessionManager();
  if (!sm) {
    logger.warn('[browser] noVNC deps missing (Xvfb/x11vnc/websockify), falling back to headless');
    return getHeadlessBrowser();
  }

  try {
    const pool = await sm.createPoolSession();
    if (pool.browser?.isConnected()) return pool.browser;
    logger.warn('[browser] CAPTCHA pool browser disconnected, falling back to headless');
  } catch (e) {
    logger.warn(`[browser] CAPTCHA pool creation failed: ${e}, falling back to headless`);
  }
  return getHeadlessBrowser();
}
/**
 * Returns the Browser for the task identified by `ctx.taskId`.
 *
 * - noVNC mode with a task id: obtained through
 *   `SessionManager.getOrCreateTaskSession()`, and the session is touched.
 * - Otherwise, or on any failure / disconnected browser: falls back to the
 *   shared headless Browser.
 *
 * Used by BrowseWeb / InteractiveBrowse. Repeated calls for the same task id
 * reuse the same Browser, which keeps cookies and login state.
 */
export async function getTaskSessionBrowser(ctx: ToolContext): Promise<Browser> {
  const novncEnabled = loadConfig().browser?.captchaSolve === 'novnc';
  const taskId = ctx.taskId;
  if (!novncEnabled || !taskId) return getHeadlessBrowser();

  const sm = getSessionManager();
  if (!sm) {
    logger.warn('[browser] noVNC deps missing, falling back to headless for task session');
    return getHeadlessBrowser();
  }

  try {
    const session = await sm.getOrCreateTaskSession(taskId, ctx.userId);
    sm.touchSession(session.id);
    if (session.browser?.isConnected()) return session.browser;
    logger.warn(`[browser] task session ${taskId} browser disconnected, falling back to headless`);
  } catch (e) {
    logger.warn(`[browser] task session ${taskId} creation failed: ${e}, falling back to headless`);
  }
  return getHeadlessBrowser();
}
/**
 * Helper for the UI (admin only) to obtain the CAPTCHA pool's noVNC path.
 *
 * @returns Session id, noVNC path, display and pending-CAPTCHA flag of the
 *   pool session, or null when there is no SessionManager or no pool session.
 */
export function getCaptchaPoolInfo(): {
  sessionId: string;
  novncPath: string;
  display: string;
  captchaPending: boolean;
} | null {
  const pool = getSessionManager()?.getSession(CAPTCHA_POOL_SESSION_ID);
  if (!pool) return null;

  const sessionId = pool.id;
  return {
    sessionId,
    novncPath: buildNovncPath(sessionId),
    display: pool.display,
    // Normalise a possibly-undefined flag to a strict boolean.
    captchaPending: pool.captchaPending === true,
  };
}
/**
 * Helper for the UI to obtain the noVNC path of the Task Session that belongs
 * to `taskId`.
 *
 * @returns Session id, noVNC path and display of the first matching task
 *   session, or null when there is no SessionManager or no such session.
 */
export function getTaskSessionInfo(taskId: string): {
  sessionId: string;
  novncPath: string;
  display: string;
} | null {
  const sm = getSessionManager();
  if (!sm) return null;

  for (const session of sm.listSessions()) {
    const matches = session.kind === 'task' && session.taskId === taskId;
    if (!matches) continue;
    return {
      sessionId: session.id,
      novncPath: buildNovncPath(session.id),
      display: session.display,
    };
  }
  return null;
}
/**
 * Tears down every browser resource owned by this module, best-effort:
 * persistent contexts of web.js (if that module loads), all headless job
 * contexts, all noVNC sessions (pool + task), and finally the shared headless
 * Browser. Cleanup errors are ignored.
 */
export async function closeBrowser(): Promise<void> {
  try {
    const web = await import('./web.js') as unknown as { clearPersistentContexts?: () => void };
    web.clearPersistentContexts?.();
  } catch {
    // Ignore: web.js may not be loadable / loaded.
  }

  // Close every job context created for headless mode.
  for (const [workspaceKey, context] of _jobContexts) {
    await context.close().catch(() => {});
    _jobContexts.delete(workspaceKey);
  }

  // Destroy all noVNC sessions (pool + task).
  const manager = getSessionManager();
  if (manager) {
    await manager.destroyAll().catch(() => {});
  }

  if (!_headlessBrowser) return;
  try {
    await _headlessBrowser.close();
  } catch {
    // ignore cleanup errors
  }
  _headlessBrowser = null;
  logger.debug('[browser] headless closed');
}
// --- Per-job persistent browser context ---
/**
* Per-workspace BrowserContext used in headless mode, keyed by workspace path.
* In noVNC mode the task session owns its own context, so no entry is stored
* here (session.context is used directly).
*/
const _jobContexts = new Map<string, BrowserContext>();
/** Pages for which route interception has already been requested in this module. */
const _interceptedPages = new WeakSet<Page>();
/** Contexts on which ensureContextHooks has already run. */
const _hookedContexts = new WeakSet<BrowserContext>();
/**
* Per-page ref → (frame, selector) mapping. Main-frame refs are `e1, e2, ...`;
* child-frame refs are prefixed `f1.e1, f1.e2, ...` (`f1` is the first iframe
* encountered in page.frames(), excluding main).
*
* Storing the Frame reference rather than a frame ID lets us dispatch click /
* fill / dumpHtml on the correct execution context (Playwright's Frame has the
* same surface as Page for these methods, and it transparently bridges
* cross-origin iframes via CDP).
*/
type RefTarget = { frame: Frame; selector: string };
const _pageRefs = new WeakMap<Page, Map<string, RefTarget>>();
// --- Download capture (Playwright `page.on('download')`) ---
//
// Downloads triggered by clicks etc. are saved into the workspace's output/
// directory so the agent can keep working on them with Read / ReadPdf / etc.
// Each BrowseWeb / BrowseWithSession return value gets a trailing
// `[download] saved output/foo.csv (12345 bytes)` line.
/** Outcome of one captured download (successful or failed). */
export interface BrowserDownloadEntry {
filename: string;
/** Workspace-relative path (e.g. "output/report.csv"); empty string for failed downloads. */
savedRelPath: string;
ok: boolean;
/** Size of the saved file in bytes; set on success. */
bytes?: number;
/** Error message; set on failure. */
error?: string;
/** ISO-8601 timestamp taken when the download event fired. */
timestamp: string;
}
/** Pages that already have the download listener attached. */
const _downloadHookedPages = new WeakSet<Page>();
/** Finished download entries per page, waiting to be drained. */
const _pageDownloads = new WeakMap<Page, BrowserDownloadEntry[]>();
/** Download handlers still in flight per page. */
const _pageDownloadPromises = new WeakMap<Page, Set<Promise<void>>>();
/**
 * Turns a browser-suggested file name into a safe basename (exported for tests).
 *
 * Directory components are dropped, characters that are path separators,
 * reserved on common file systems, or whitespace are replaced with `_`, and
 * the result is capped at 200 characters. Empty results and dot-only names
 * (".", "..", ...) — which would refer to a directory rather than a file once
 * joined onto the output directory — fall back to "download".
 *
 * @param name - Suggested file name; may be missing.
 * @returns A non-empty basename that is safe to join onto a directory.
 */
export function sanitizeDownloadFilename(name: string | undefined | null): string {
  const base = path.basename((name ?? '').toString() || 'download');
  const cleaned = base.replace(/[\\/:*?"<>|\s]/g, '_').slice(0, 200);
  if (cleaned === '' || /^\.+$/.test(cleaned)) return 'download';
  return cleaned;
}
/**
 * Picks a path under `<workspacePath>/output` for `filename`, creating the
 * directory when needed (exported for tests).
 *
 * When the plain name is taken, numbered variants "foo-1.csv", "foo-2.csv",
 * ... are tried, up to suffix 999; the last candidate tried is returned.
 *
 * @param workspacePath - Workspace root directory.
 * @param filename - Desired (already sanitised) file name.
 * @returns Absolute-or-relative path mirroring `workspacePath`'s form.
 */
export function pickUniqueOutputPath(workspacePath: string, filename: string): string {
  const outputDir = path.join(workspacePath, 'output');
  if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });

  const { name: stem, ext } = path.parse(filename);
  let candidate = path.join(outputDir, filename);
  let suffix = 1;
  while (suffix < 1000 && existsSync(candidate)) {
    candidate = path.join(outputDir, `${stem}-${suffix}${ext}`);
    suffix += 1;
  }
  return candidate;
}
/**
 * Appends one download record (tagged `source: 'BrowseWeb'`) as a JSON line to
 * `<workspacePath>/logs/downloads.jsonl`, creating the logs directory when
 * missing. Best-effort: failures are logged as a warning and never thrown.
 */
function logBrowserDownload(workspacePath: string, entry: BrowserDownloadEntry): void {
  try {
    const logsDir = path.join(workspacePath, 'logs');
    if (!existsSync(logsDir)) mkdirSync(logsDir, { recursive: true });

    const historyFile = path.join(logsDir, 'downloads.jsonl');
    const record = { ...entry, source: 'BrowseWeb' };
    appendFileSync(historyFile, `${JSON.stringify(record)}\n`);
  } catch (e) {
    logger.warn(`[BrowseWeb] failed to write download history: ${(e as Error).message}`);
  }
}
/**
 * Attaches (once per page) a `download` listener that saves each download into
 * the workspace's output/ directory and records the outcome.
 *
 * Every download produces one `BrowserDownloadEntry` (success or failure) that
 * is queued on the page's entry list and appended to the download history
 * log. The in-flight handler promise is tracked per page and removed from the
 * tracking set when it settles.
 *
 * The entry list is looked up at the moment a download finishes, not when it
 * starts: the list stored for the page can be replaced while a download is
 * still running, and pushing into a list captured earlier would drop the
 * entry silently.
 *
 * @param page - Page to observe.
 * @param workspacePath - Workspace root; files go to `<workspacePath>/output`.
 */
function setupDownloadHandler(page: Page, workspacePath: string): void {
  if (_downloadHookedPages.has(page)) return;
  _downloadHookedPages.add(page);
  if (!_pageDownloads.has(page)) _pageDownloads.set(page, []);
  if (!_pageDownloadPromises.has(page)) _pageDownloadPromises.set(page, new Set());

  // Queue on whichever list is current for the page right now.
  const record = (entry: BrowserDownloadEntry): void => {
    let queue = _pageDownloads.get(page);
    if (!queue) {
      queue = [];
      _pageDownloads.set(page, queue);
    }
    queue.push(entry);
    logBrowserDownload(workspacePath, entry);
  };

  page.on('download', (download: Download) => {
    let inFlight = _pageDownloadPromises.get(page);
    if (!inFlight) {
      inFlight = new Set();
      _pageDownloadPromises.set(page, inFlight);
    }
    const tracked = inFlight;
    const filename = sanitizeDownloadFilename(download.suggestedFilename());
    const ts = new Date().toISOString();

    const handlerPromise = (async () => {
      try {
        const savePath = pickUniqueOutputPath(workspacePath, filename);
        await download.saveAs(savePath);
        const bytes = statSync(savePath).size;
        const savedRelPath = path.relative(workspacePath, savePath);
        record({
          filename: path.basename(savePath),
          savedRelPath, ok: true, bytes, timestamp: ts,
        });
        logger.debug(`[BrowseWeb] downloaded ${savedRelPath} (${bytes} bytes)`);
      } catch (e) {
        record({
          filename, savedRelPath: '', ok: false,
          error: (e as Error).message, timestamp: ts,
        });
      }
    })();

    tracked.add(handlerPromise);
    handlerPromise.finally(() => tracked.delete(handlerPromise));
  });
}
/**
 * Waits up to `timeoutMs` for the page's in-flight downloads, then returns the
 * entries queued so far and resets the page's queue to an empty list.
 *
 * The timeout timer is cleared as soon as the wait ends, so a call whose
 * downloads finish quickly does not leave a timer pending for the remainder
 * of `timeoutMs`.
 *
 * @param page - Page whose downloads should be collected.
 * @param timeoutMs - Maximum time to wait for in-flight downloads (default 30 s).
 * @returns The entries that were queued for the page (possibly empty).
 */
async function drainDownloads(page: Page, timeoutMs: number = 30_000): Promise<BrowserDownloadEntry[]> {
  const inFlight = _pageDownloadPromises.get(page);
  if (inFlight && inFlight.size > 0) {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeout = new Promise<void>((resolve) => {
      timer = setTimeout(resolve, timeoutMs);
    });
    try {
      await Promise.race([Promise.all(Array.from(inFlight)), timeout]);
    } finally {
      if (timer !== undefined) clearTimeout(timer);
    }
  }
  const entries = _pageDownloads.get(page) ?? [];
  _pageDownloads.set(page, []);
  return entries;
}
/**
 * Renders download entries as one `[download] ...` line each, joined with
 * newlines. Returns an empty string when there are no entries.
 */
function formatDownloadLines(entries: BrowserDownloadEntry[]): string {
  const lines: string[] = [];
  for (const entry of entries) {
    if (entry.ok) {
      lines.push(`[download] saved ${entry.savedRelPath} (${entry.bytes} bytes)`);
    } else {
      lines.push(`[download] FAILED ${entry.filename}: ${entry.error}`);
    }
  }
  return lines.join('\n');
}
/**
 * Returns the BrowserContext a job should drive.
 *
 * - noVNC mode with `ctx.taskId`: the context of the task session obtained
 *   from the SessionManager (the same screen an admin / task owner sees
 *   through noVNC). Any failure there falls through to headless.
 * - Otherwise: a context on the shared headless Browser, created once per
 *   `ctx.workspacePath` (seeded with `ctx.browserSessionState` when present,
 *   with the stealth and snapshot init scripts applied) and cached in
 *   `_jobContexts`.
 */
async function getJobContext(
  ctx: ToolContext,
): Promise<BrowserContext> {
  const novncEnabled = loadConfig().browser?.captchaSolve === 'novnc';
  const taskId = ctx.taskId;
  const sm = novncEnabled && taskId ? getSessionManager() : null;

  if (sm && taskId) {
    try {
      const session = await sm.getOrCreateTaskSessionWithState(
        taskId,
        ctx.userId,
        ctx.browserSessionState ?? null,
        ctx.browserSessionProfileId ?? null,
      );
      sm.touchSession(session.id);
      if (session.context) return session.context;
    } catch (e) {
      logger.warn(`[browser] task session for taskId=${taskId} unavailable: ${(e as Error).message}, falling back to headless`);
    }
  }

  const cached = _jobContexts.get(ctx.workspacePath);
  if (cached && cached.browser() !== null) return cached;

  const browser = await getHeadlessBrowser();
  const contextOptions = ctx.browserSessionState
    ? { storageState: ctx.browserSessionState as never }
    : {};
  const created = await browser.newContext(contextOptions);

  const { applyStealthInitScript, applyAgentSnapshotHooks } = await import('../browser-launch.js');
  await applyStealthInitScript(created);
  await applyAgentSnapshotHooks(created);
  _jobContexts.set(ctx.workspacePath, created);
  return created;
}
/**
 * Installs, once per BrowserContext, the per-page hooks this module relies on:
 * SSRF route interception, download capture, and invalidation of the page's
 * ref map whenever the main frame navigates.
 *
 * The same hooks are applied to pages opened later (via the context's `page`
 * event) and to pages that already exist when this runs. Previously, existing
 * pages only received the download hook, leaving them without route
 * interception and with ref maps that survived navigation.
 *
 * Route interception is set up asynchronously and is not awaited here; a
 * failure is logged instead of being silently discarded.
 * NOTE(review): setupRouteInterception is defined outside this block — confirm
 * it is safe to call on pages created by the session manager.
 *
 * @param context - Context to hook.
 * @param allowedHosts - Hosts passed through to the SSRF interception.
 * @param workspacePath - Workspace root, used for file URL checks and downloads.
 */
function ensureContextHooks(
  context: BrowserContext,
  allowedHosts: string[],
  workspacePath: string,
): void {
  if (_hookedContexts.has(context)) return;
  _hookedContexts.add(context);

  const hookPage = (page: Page): void => {
    if (!_interceptedPages.has(page)) {
      _interceptedPages.add(page);
      setupRouteInterception(page, allowedHosts, workspacePath).catch((e: unknown) => {
        logger.warn(`[BrowseWeb] failed to set up route interception: ${(e as Error)?.message ?? String(e)}`);
      });
    }
    setupDownloadHandler(page, workspacePath);
    page.on('framenavigated', (frame) => {
      // Refs point at elements of the previous document; drop them on main-frame navigation.
      if (frame === page.mainFrame()) _pageRefs.delete(page);
    });
  };

  context.on('page', hookPage);
  // Also cover pages that were opened before the hooks were installed.
  for (const existingPage of context.pages()) {
    hookPage(existingPage);
  }
}
/**
 * Returns the Page used by BrowseWeb / InteractiveBrowse for this job.
 *
 * The most recently listed page of the job's context is reused when one
 * exists; otherwise a new page is opened with route interception and
 * ref-map invalidation on main-frame navigation. Either way the page's
 * default timeout is set to `timeout`. Within the same taskId /
 * workspacePath the session (cookies, login state) is preserved.
 */
async function getJobPage(
  ctx: ToolContext,
  allowedHosts: string[],
  timeout: number,
): Promise<Page> {
  const context = await getJobContext(ctx);
  ensureContextHooks(context, allowedHosts, ctx.workspacePath);

  const openPages = context.pages();
  const latest = openPages[openPages.length - 1];
  if (latest) {
    latest.setDefaultTimeout(timeout);
    return latest;
  }

  const created = await context.newPage();
  created.setDefaultTimeout(timeout);
  await setupRouteInterception(created, allowedHosts, ctx.workspacePath);
  _interceptedPages.add(created);
  created.on('framenavigated', (frame) => {
    if (frame === created.mainFrame()) _pageRefs.delete(created);
  });
  return created;
}
/**
* ページの DOM をスキャンし、表示テキスト + 操作可能要素のリファレンス注釈を返す。
* 注釈は {e1 button "ログイン"} のような形式で本文中に埋め込まれる。
* 各 ref は Playwright が解釈できるセレクタ (属性ベース優先、fallback で
* XPath 風 nth-of-type) にマップされる。
*
* 検出する操作可能要素("div クリック" 系を取り逃さないための拡張):
* - 標準タグ: A, BUTTON, INPUT, SELECT, TEXTAREA, LABEL, SUMMARY, DETAILS, OPTION
* - ARIA role: button, link, menuitem, menuitemcheckbox/radio, tab, option,
* checkbox, radio, switch, combobox, listbox, slider, spinbutton,
* textbox, searchbox, treeitem
* - [onclick] / [tabindex >= 0] / [contenteditable=true]
*
* 状態属性 (aria-expanded / aria-checked / aria-selected / aria-pressed /
* aria-disabled / aria-haspopup) は注釈末尾に列挙される。
*
* Open shadow DOM も走査する。IFRAME と <svg> 内部は走査しない(前者は別
* frame、後者はノイズが多い。<svg role="button"> 自体は親側で interactive
* 検出される)。
*
* page.evaluate に渡す関数はブラウザ側で実行されるため、DOM API を使用する。
* Node サイドの tsconfig には DOM 型がないため、関数を文字列として渡す。
*/
const SNAPSHOT_SCRIPT = `(() => {
// IFRAME is intentionally NOT in HARD_SKIP — encountering an <iframe> emits
// an inline placeholder so the agent sees where the frame sits. The actual
// frame contents are walked separately on the Node side via Playwright's
// page.frames() API and merged into the final snapshot.
const HARD_SKIP_TAGS = new Set(['SCRIPT', 'STYLE', 'NOSCRIPT', 'TEMPLATE', 'SVG']);
const INTERACTIVE_TAGS = new Set(['A', 'BUTTON', 'INPUT', 'SELECT', 'TEXTAREA', 'LABEL', 'SUMMARY', 'DETAILS', 'OPTION']);
const INTERACTIVE_ROLES = new Set([
'button', 'link', 'menuitem', 'menuitemcheckbox', 'menuitemradio',
'tab', 'option', 'checkbox', 'radio', 'switch', 'combobox',
'listbox', 'slider', 'spinbutton', 'textbox', 'searchbox', 'treeitem'
]);
const SIMPLE_LABEL_TAGS = new Set(['A', 'BUTTON', 'INPUT', 'SELECT', 'OPTION', 'SUMMARY']);
const SIMPLE_LABEL_ROLES = new Set(['button', 'link', 'menuitem', 'option', 'tab', 'checkbox', 'radio', 'switch']);
const BLOCK_TAGS = new Set([
'DIV', 'P', 'SECTION', 'ARTICLE', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6',
'LI', 'TR', 'BR', 'BLOCKQUOTE', 'PRE', 'NAV', 'HEADER', 'FOOTER',
'ASIDE', 'MAIN', 'FORM', 'FIELDSET', 'TABLE', 'UL', 'OL', 'DL', 'DT', 'DD'
]);
const refs = [];
const lines = [];
let counter = 0;
// ── Selector generation ───────────────────────────────────
// Quote-free attribute values only — if a value contains " or \\, skip
// that selector and fall back to the next strategy. Avoids escaping
// nightmares in CSS attribute selectors.
function safeAttr(s) { return typeof s === 'string' && s.length > 0 && !/["\\\\]/.test(s); }
function uniqueIn(root, sel) {
try { return root.querySelectorAll(sel).length === 1; } catch (e) { return false; }
}
function buildSelector(el) {
const root = el.getRootNode();
const tagLower = el.tagName.toLowerCase();
const candidates = [];
const tid = el.getAttribute('data-testid');
if (safeAttr(tid)) candidates.push('[data-testid="' + tid + '"]');
const tid2 = el.getAttribute('data-test');
if (safeAttr(tid2)) candidates.push('[data-test="' + tid2 + '"]');
const tidQa = el.getAttribute('data-qa');
if (safeAttr(tidQa)) candidates.push('[data-qa="' + tidQa + '"]');
const id = el.id;
if (id && /^[A-Za-z][\\w-]*$/.test(id)) candidates.push('#' + id);
const name = el.getAttribute('name');
if (safeAttr(name) && (tagLower === 'input' || tagLower === 'select' || tagLower === 'textarea')) {
candidates.push(tagLower + '[name="' + name + '"]');
}
const aria = el.getAttribute('aria-label');
if (safeAttr(aria) && aria.length < 80) candidates.push('[aria-label="' + aria + '"]');
for (const sel of candidates) {
if (uniqueIn(root, sel)) return sel;
}
// Fallback: nth-of-type CSS chain. Only resolves in the document tree
// (not across shadow boundaries). For shadow DOM elements without a
// unique attribute, this fallback won't pierce — caller should use
// dumpHtml then a more specific selector.
const parts = [];
let cur = el;
while (cur && cur.nodeType === 1 && cur !== document.body && cur.tagName !== 'HTML') {
let index = 1;
let sib = cur.previousElementSibling;
while (sib) {
if (sib.tagName === cur.tagName) index++;
sib = sib.previousElementSibling;
}
parts.unshift(cur.tagName.toLowerCase() + ':nth-of-type(' + index + ')');
cur = cur.parentElement;
}
return 'body > ' + parts.join(' > ');
}
// ── Visibility & interactivity ────────────────────────────
function isHidden(el) {
if (el.getAttribute && el.getAttribute('aria-hidden') === 'true') return true;
if (el.hasAttribute && el.hasAttribute('hidden')) return true;
let cur = el;
while (cur && cur.hasAttribute) {
if (cur.hasAttribute('inert')) return true;
cur = cur.parentElement;
}
return false;
}
function isVisible(el) {
const style = window.getComputedStyle(el);
if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') return false;
const rect = el.getBoundingClientRect();
return rect.width > 0 && rect.height > 0;
}
function isContenteditable(el) {
const v = el.getAttribute && el.getAttribute('contenteditable');
return v === 'true' || v === '';
}
function isInteractive(el) {
if (INTERACTIVE_TAGS.has(el.tagName)) return true;
const role = el.getAttribute && el.getAttribute('role');
if (role && INTERACTIVE_ROLES.has(role)) return true;
if (el.hasAttribute && el.hasAttribute('onclick')) return true;
// Tagged by browser-launch.applyAgentSnapshotHooks when JS code calls
// addEventListener('click'|'mousedown'|'pointerdown', ...). Captures the
// "naked clickable <div>" pattern common in jQuery / vanilla / Vue apps.
if (el.hasAttribute && el.hasAttribute('data-ao-click')) return true;
const tabindex = el.getAttribute && el.getAttribute('tabindex');
if (tabindex !== null && tabindex !== undefined && parseInt(tabindex, 10) >= 0) return true;
if (isContenteditable(el)) return true;
return false;
}
// ── Description ───────────────────────────────────────────
function elementName(el) {
const aria = el.getAttribute && el.getAttribute('aria-label');
if (aria) return aria.trim().slice(0, 80);
const labelledBy = el.getAttribute && el.getAttribute('aria-labelledby');
if (labelledBy) {
const ids = labelledBy.split(/\\s+/);
const parts = [];
for (const lid of ids) {
const target = document.getElementById(lid);
if (target) parts.push((target.textContent || '').trim());
}
const joined = parts.join(' ').trim();
if (joined) return joined.slice(0, 80);
}
const text = (el.textContent || '').replace(/\\s+/g, ' ').trim();
if (text) return text.slice(0, 80);
const placeholder = el.getAttribute && el.getAttribute('placeholder');
if (placeholder) return placeholder.trim().slice(0, 80);
const title = el.getAttribute && el.getAttribute('title');
if (title) return title.trim().slice(0, 80);
const alt = el.getAttribute && el.getAttribute('alt');
if (alt) return alt.trim().slice(0, 80);
return '';
}
function elementRole(el) {
const explicit = el.getAttribute && el.getAttribute('role');
if (explicit) return explicit;
const tag = el.tagName.toLowerCase();
if (tag === 'a' && el.getAttribute && el.getAttribute('href')) return 'link';
if (tag === 'button') return 'button';
if (tag === 'input') {
const t = (el.getAttribute('type') || 'text').toLowerCase();
if (t === 'checkbox' || t === 'radio') return t;
if (t === 'submit' || t === 'button' || t === 'reset' || t === 'image') return 'button';
if (t === 'search') return 'searchbox';
return 'textbox';
}
if (tag === 'select') return 'combobox';
if (tag === 'textarea') return 'textbox';
if (tag === 'summary') return 'button';
if (tag === 'details') return 'group';
if (tag === 'option') return 'option';
if (isContenteditable(el)) return 'textbox';
return tag;
}
function elementStates(el) {
const states = [];
const expanded = el.getAttribute && el.getAttribute('aria-expanded');
if (expanded === 'true') states.push('expanded');
else if (expanded === 'false') states.push('collapsed');
if (el.tagName === 'DETAILS' && el.open) states.push('expanded');
const pressed = el.getAttribute && el.getAttribute('aria-pressed');
if (pressed === 'true') states.push('pressed');
const selected = el.getAttribute && el.getAttribute('aria-selected');
if (selected === 'true') states.push('selected');
const checked = el.getAttribute && el.getAttribute('aria-checked');
if (checked === 'true') states.push('checked');
else if (checked === 'mixed') states.push('mixed');
if (el.tagName === 'INPUT' && (el.type === 'checkbox' || el.type === 'radio') && el.checked && !states.includes('checked')) {
states.push('checked');
}
if (el.disabled === true || (el.getAttribute && el.getAttribute('aria-disabled') === 'true')) states.push('disabled');
if (el.required === true) states.push('required');
const haspopup = el.getAttribute && el.getAttribute('aria-haspopup');
if (haspopup && haspopup !== 'false') states.push('haspopup');
return states;
}
function describeElement(el, ref) {
const role = elementRole(el);
const name = elementName(el).replace(/"/g, "'");
const states = elementStates(el);
const tag = el.tagName.toLowerCase();
const parts = [ref, role];
if (name) parts.push('"' + name + '"');
if (tag === 'input' || tag === 'textarea') {
const v = el.value || '';
if (v) parts.push('value="' + String(v).slice(0, 30).replace(/"/g, "'") + '"');
}
if (tag === 'a') {
const href = el.getAttribute('href');
if (href) parts.push('href="' + href.slice(0, 60).replace(/"/g, "'") + '"');
}
if (states.length) parts.push(states.join(' '));
return '{' + parts.join(' ') + '}';
}
// ── Walk ─────────────────────────────────────────────────
function walk(node) {
if (node.nodeType === 3) {
const t = (node.textContent || '').replace(/\\s+/g, ' ').trim();
if (t) lines.push(t);
return;
}
if (node.nodeType !== 1) return;
const el = node;
if (HARD_SKIP_TAGS.has(el.tagName)) return;
if (isHidden(el)) return;
if (!isVisible(el)) return;
// <iframe> emits an inline placeholder so the agent sees its position.
// The Node side walks the frame's contents separately and replaces the
// matching placeholder with the merged frame snapshot.
if (el.tagName === 'IFRAME') {
const src = el.getAttribute('src') || '';
const name = el.getAttribute('name') || '';
const title = el.getAttribute('title') || '';
const labelParts = [];
if (name) labelParts.push('name=' + name);
if (title) labelParts.push('title=' + title.slice(0, 60));
if (src) labelParts.push('src=' + src.slice(0, 120));
lines.push('\\n[[IFRAME ' + (labelParts.join(' ') || '?') + ']]\\n');
return;
}
if (BLOCK_TAGS.has(el.tagName)) lines.push('\\n');
if (isInteractive(el)) {
counter++;
const ref = 'e' + counter;
refs.push({ ref: ref, selector: buildSelector(el) });
lines.push(describeElement(el, ref));
const role = el.getAttribute && el.getAttribute('role');
const skip = SIMPLE_LABEL_TAGS.has(el.tagName) || (role && SIMPLE_LABEL_ROLES.has(role));
if (skip) return;
}
for (const child of Array.from(node.childNodes)) walk(child);
if (el.shadowRoot && el.shadowRoot.mode === 'open') {
for (const child of Array.from(el.shadowRoot.childNodes)) walk(child);
}
}
if (document.body) walk(document.body);
let text = lines.join(' ')
.replace(/ \\n/g, '\\n')
.replace(/\\n /g, '\\n')
.replace(/\\n{3,}/g, '\\n\\n')
.trim();
return { text: text, refs: refs };
})()`;
/** Shape returned by SNAPSHOT_SCRIPT: rendered text plus the ref → selector table for one frame. */
type FrameSnapshotResult = { text: string; refs: Array<{ ref: string; selector: string }> };

/** Upper bound for a single frame's snapshot evaluation, in milliseconds. */
const FRAME_SNAPSHOT_TIMEOUT_MS = 5000;

/**
 * Evaluate SNAPSHOT_SCRIPT inside `frame`, rejecting if it takes longer than
 * FRAME_SNAPSHOT_TIMEOUT_MS.
 *
 * @param frame Frame to evaluate the snapshot script in.
 * @returns The `{ text, refs }` object produced by the script.
 * @throws Error `frame snapshot timeout …ms` on timeout, or whatever
 *   `frame.evaluate` rejects with.
 */
async function evaluateFrameSnapshot(frame: Frame): Promise<FrameSnapshotResult> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<never>((_, reject) => {
    timer = setTimeout(
      () => reject(new Error(`frame snapshot timeout ${FRAME_SNAPSHOT_TIMEOUT_MS}ms`)),
      FRAME_SNAPSHOT_TIMEOUT_MS,
    );
  });
  try {
    return await Promise.race<FrameSnapshotResult>([
      frame.evaluate(SNAPSHOT_SCRIPT) as unknown as Promise<FrameSnapshotResult>,
      timeout,
    ]);
  } finally {
    // Always release the timer: otherwise every snapshot leaves a pending
    // timer behind for the full timeout even when evaluation finished at once.
    if (timer !== undefined) clearTimeout(timer);
  }
}
/**
 * Build a text snapshot of `page` covering the main frame and its child frames.
 *
 * Output layout: the main-frame text comes first, then one delimited section
 * per attached child frame (`--- iframe fN url="..." ---` … `--- end iframe fN ---`).
 * Refs emitted inside a child frame are rewritten from `eM` to `fN.eM`, and the
 * same prefixed key is stored in the page's ref table so later actions can
 * resolve the ref to the right frame.
 *
 * Every attached frame returned by `page.frames()` is evaluated; no origin
 * filtering happens here. A frame whose evaluation fails (including the
 * evaluateFrameSnapshot timeout) still gets a section, containing
 * `[cannot inspect: …]`, and still consumes a frame number.
 *
 * Side effect: replaces the ref table stored for `page` in `_pageRefs`.
 */
async function snapshotPage(page: Page): Promise<string> {
  const main = page.mainFrame();
  const targets = new Map<string, RefTarget>();

  let mainSection: string;
  try {
    const snap = await evaluateFrameSnapshot(main);
    mainSection = snap.text;
    for (const entry of snap.refs) {
      targets.set(entry.ref, { frame: main, selector: entry.selector });
    }
  } catch (err) {
    mainSection = `[main frame snapshot failed: ${(err as Error).message}]`;
  }

  const output: string[] = [mainSection];
  // Wraps a frame body in its header/footer delimiters.
  const wrap = (header: string, body: string, footer: string): string =>
    `\n${header}\n${body}\n${footer}`;

  let childCount = 0;
  for (const child of page.frames()) {
    // Detachment is checked lazily, right before each frame is processed.
    if (child === main || child.isDetached()) continue;

    childCount += 1;
    const label = `f${childCount}`;
    const childUrl = child.url();
    const childName = child.name();
    const nameAttr = childName ? ` name="${childName}"` : '';
    const header = `--- iframe ${label} url="${childUrl}"${nameAttr} ---`;
    const footer = `--- end iframe ${label} ---`;

    let snap: FrameSnapshotResult;
    try {
      snap = await evaluateFrameSnapshot(child);
    } catch (err) {
      output.push(wrap(header, `[cannot inspect: ${(err as Error).message}]`, footer));
      continue;
    }

    for (const entry of snap.refs) {
      targets.set(`${label}.${entry.ref}`, { frame: child, selector: entry.selector });
    }
    // Prefix every `{eN` ref marker in the text with this frame's label.
    const prefixed = snap.text.replace(/\{e(\d+)\b/g, (_match, num: string) => `{${label}.e${num}`);
    output.push(wrap(header, prefixed || '[empty]', footer));
  }

  _pageRefs.set(page, targets);
  return output.join('\n');
}
/** Default number of characters returned inline before the text is spilled to a file. */
const BROWSE_TEXT_PREVIEW_CHARS = 5000;

/**
 * Return browse output inline when it is short, otherwise persist it and
 * return a preview.
 *
 * When `text` is longer than `previewLimit`, the full text is written to
 * `<workspace>/logs/browse/{timestamp}-{hash}.txt` and the return value is the
 * leading preview followed by a note with the workspace-relative path, so the
 * caller can fetch the rest with `Read({ file_path, offset, limit })`. Text
 * that fits within `previewLimit` is returned unchanged and no file is written.
 *
 * @param ctx Only `workspacePath` is used; it is the root the log file is written under.
 * @param url Page URL; only used (together with `label`) to derive the file-name hash.
 * @param text Full text to return or persist.
 * @param label Optional discriminator mixed into the file-name hash.
 * @param previewLimit Maximum number of UTF-16 code units returned inline.
 * @returns `text` itself, or the preview plus the truncation note.
 */
async function saveBrowseText(
  ctx: { workspacePath: string },
  url: string,
  text: string,
  label?: string,
  previewLimit: number = BROWSE_TEXT_PREVIEW_CHARS,
): Promise<string> {
  if (text.length <= previewLimit) return text;

  const ts = new Date().toISOString().replace(/[:.]/g, '-');
  const crypto = await import('crypto');
  const hash = crypto.createHash('sha1').update(url + '\n' + (label ?? '')).digest('hex').slice(0, 8);
  const dir = path.join(ctx.workspacePath, 'logs', 'browse');
  const { mkdirSync, writeFileSync } = await import('fs');
  mkdirSync(dir, { recursive: true });
  const filename = `${ts}-${hash}.txt`;
  const filepath = path.join(dir, filename);
  writeFileSync(filepath, text, 'utf-8');
  const relPath = path.posix.join('logs', 'browse', filename);

  // Do not end the preview on a high surrogate: cutting between the two halves
  // of a surrogate pair would leave a lone surrogate (invalid Unicode) in the
  // returned string. Back off by one code unit in that case.
  let cut = previewLimit;
  const lastUnit = text.charCodeAt(cut - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
  const preview = text.slice(0, cut);

  return (
    preview +
    `\n... (truncated; full ${text.length} chars saved to ${relPath}` +
    ` Read({file_path:"${relPath}", offset, limit}) で続きを取得可能)`
  );
}
/**
 * Helper for the dumpHtml action: serialise an element's markup, sanitised and
 * depth-limited.
 *
 * Runs inside `frame`: picks the first element matching `selector` (or
 * `document.body` when no selector is given), clones it, removes
 * script / style / noscript / template elements, and replaces the contents of
 * every element sitting at `depth` levels below the root (when it has element
 * children) with a `<!-- truncated: … -->` comment.
 *
 * @param frame Frame to evaluate in (main frame or an iframe, e.g. the frame
 *   returned by resolveRef).
 * @param selector CSS selector of the root element; `undefined` means the body.
 * @param depth Number of element levels kept below the root.
 * @returns The full sanitised outerHTML, or `null` when no root element exists.
 */
async function dumpElementHtml(
  frame: Frame,
  selector: string | undefined,
  depth: number,
): Promise<string | null> {
  const payload = { selector: selector ?? null, depth };
  return await frame.evaluate((args: { selector: string | null; depth: number }) => {
    const root = args.selector ? document.querySelector(args.selector) : document.body;
    if (!root) return null;

    const copy = root.cloneNode(true) as Element;

    // Drop tags that carry no readable markup.
    copy.querySelectorAll('script, style, noscript, template').forEach((node) => node.remove());

    // Walk the clone with an explicit work list; anything at the depth limit
    // that still has element children is collapsed into a marker comment.
    const pending: Array<{ el: Element; level: number }> = [{ el: copy, level: 0 }];
    while (pending.length > 0) {
      const item = pending.pop();
      if (!item) break;
      if (item.level < args.depth) {
        for (const child of Array.from(item.el.children)) {
          pending.push({ el: child, level: item.level + 1 });
        }
        continue;
      }
      const childCount = item.el.children.length;
      if (childCount > 0) {
        item.el.innerHTML = `<!-- truncated: ${childCount} children at depth ${item.level} -->`;
      }
    }
    return copy.outerHTML;
  }, payload);
}
/**
 * Look up the (frame, selector) pair recorded for `ref` in the ref table
 * stored for `page`. Returns null when the page has no table or the ref is
 * not in it.
 */
function resolveRef(page: Page, ref: string): RefTarget | null {
  const table = _pageRefs.get(page);
  if (!table) return null;
  const target = table.get(ref);
  return target ?? null;
}
/**
 * Backslash-escape every `"` and `\` in `s` so it can be embedded inside a
 * double-quoted CSS attribute value (`[attr="…"]`). Other characters pass
 * through untouched.
 */
function cssEscapeAttr(s: string): string {
  return s.replace(/["\\]/g, (ch) => '\\' + ch);
}
/**
 * Describe how to reach `targetFrame` from the main frame, one entry per
 * iframe boundary, outermost first. The result is empty when the target is
 * the main frame itself.
 *
 * For each boundary the iframe element's `id`, `name` and `src` attributes are
 * tried in that order; the first one whose `iframe[attr="…"]` selector matches
 * exactly one element in the parent frame is used. If none is unique — or the
 * frame element cannot be obtained at all — the entry falls back to
 * `{ selector: 'iframe', index }`, where `index` is the frame's position in
 * the parent's `childFrames()` list (0 when it is not found there).
 */
async function buildFrameChain(targetFrame: Frame): Promise<FrameChainEntry[]> {
  const mainFrame = targetFrame.page().mainFrame();
  const chain: FrameChainEntry[] = [];

  // Positional fallback entry for `child` inside `owner`.
  const positional = (owner: Frame, child: Frame): FrameChainEntry => ({
    selector: 'iframe',
    index: Math.max(owner.childFrames().indexOf(child), 0),
  });

  let frame: Frame = targetFrame;
  while (frame !== mainFrame) {
    const parent: Frame | null = frame.parentFrame();
    if (!parent) break;

    let uniqueSelector: string | null = null;
    try {
      const element = await frame.frameElement();

      // Collect candidate selectors first, then probe them for uniqueness.
      const candidates: string[] = [];
      for (const attr of ['id', 'name', 'src']) {
        const value = await element.getAttribute(attr).catch(() => null);
        if (value) candidates.push(`iframe[${attr}="${cssEscapeAttr(value)}"]`);
      }
      for (const candidate of candidates) {
        const matches = await parent.locator(candidate).count().catch(() => 0);
        if (matches === 1) {
          uniqueSelector = candidate;
          break;
        }
      }
    } catch {
      // frameElement() failed (e.g. the frame went away): use the position.
      uniqueSelector = null;
    }

    chain.unshift(uniqueSelector ? { selector: uniqueSelector } : positional(parent, frame));
    frame = parent;
  }
  return chain;
}
/**
 * Frame chain for `frame`: an empty list when it is the page's main frame,
 * otherwise whatever buildFrameChain computes for it.
 * Exported for testing.
 */
export async function captureFrameChain(frame: Frame): Promise<FrameChainEntry[]> {
  const isMainFrame = frame.page().mainFrame() === frame;
  return isMainFrame ? [] : buildFrameChain(frame);
}
/**
 * Call when a job finishes. Closes only the BrowserContext registered for
 * that job's `workspacePath` and removes it from the registry; a workspace
 * with no registered context is a no-op. Errors from `close()` are ignored.
 */
export async function cleanupJobContext(workspacePath: string): Promise<void> {
  const jobContext = _jobContexts.get(workspacePath);
  if (!jobContext) return;

  await jobContext.close().catch(() => {});
  _jobContexts.delete(workspacePath);
  logger.debug(`[browser] cleaned up job context for ${workspacePath}`);
}
/**
 * Install a catch-all route handler on `page` that blocks requests to private
 * addresses (SSRF protection).
 *
 * Per request, in order:
 *  - unparseable URL → aborted;
 *  - `file:` → passed through normalizeFileUrlForWorkspace; aborted when that
 *    reports an error, otherwise continued with the normalised URL;
 *  - `data:` / `blob:` → continued;
 *  - host accepted by isHostAllowed(hostname, allowedHosts) → continued
 *    without any DNS check;
 *  - hostname `localhost` → aborted;
 *  - otherwise the hostname is resolved and the request is aborted when ANY
 *    resolved address is private, or when resolution fails.
 *
 * @param page Page to install the route on.
 * @param allowedHosts Hosts exempt from the private-address check.
 * @param workspacePath Workspace root used to validate `file:` URLs.
 */
async function setupRouteInterception(page: Page, allowedHosts: string[], workspacePath: string): Promise<void> {
  await page.route('**/*', async (route) => {
    const reqUrl = route.request().url();
    let parsed: URL;
    try {
      parsed = new URL(reqUrl);
    } catch {
      await route.abort('blockedbyclient');
      return;
    }
    const hostname = parsed.hostname;

    if (parsed.protocol === 'file:') {
      const normalized = normalizeFileUrlForWorkspace(reqUrl, workspacePath);
      if ('error' in normalized) {
        logger.warn(`[browser] blocked file URL outside workspace: ${reqUrl}`);
        await route.abort('blockedbyclient');
        return;
      }
      await route.continue({ url: normalized.url });
      return;
    }

    if (parsed.protocol === 'data:' || parsed.protocol === 'blob:') {
      await route.continue();
      return;
    }

    // Skip check for allowed hosts
    if (isHostAllowed(hostname, allowedHosts)) {
      await route.continue();
      return;
    }

    // Block localhost explicitly
    if (hostname === 'localhost') {
      logger.warn(`[browser] SSRF blocked navigation to localhost: ${reqUrl}`);
      await route.abort('blockedbyclient');
      return;
    }

    // DNS resolve and check for private IPs. Every resolved address is
    // inspected (`all: true`): looking only at the first record would let a
    // host that publishes both a public and a private address through.
    try {
      const addresses = await dns.promises.lookup(hostname, { all: true });
      const privateHit = addresses.find(
        (entry) => isPrivateIPv4(entry.address) || isPrivateIPv6(entry.address),
      );
      if (privateHit || addresses.length === 0) {
        logger.warn(`[browser] SSRF blocked: ${hostname} -> ${privateHit?.address ?? '(no address)'}`);
        await route.abort('blockedbyclient');
        return;
      }
    } catch {
      // DNS failure for network URLs: block the request.
      await route.abort('blockedbyclient');
      return;
    }

    await route.continue();
  });
}
// --- BrowseWeb implementation ---
/**
 * Entry point for the BrowseWeb tool.
 *
 * Requires at least one of `url` or a non-empty `actions` list. When
 * `recordTo` is given and the context carries both a task id and a user id,
 * the recorder is enabled for the task unless it already is. A non-empty
 * `actions` list is dispatched to executeActions; otherwise the call is
 * handled by executeSimple.
 */
async function executeBrowseWeb(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const url = input['url'] as string | undefined;
  const actions = input['actions'] as BrowseWebAction[] | undefined;
  const recordTo = input['recordTo'] as string | undefined;

  const noActions = !actions || actions.length === 0;
  if (!url && noActions) {
    return { output: 'BrowseWeb error: url または actions のいずれかが必要です', isError: true };
  }

  // Turn the recorder on for this task when recordTo is specified. The
  // recordTo() lookup keeps this idempotent: a second BrowseWeb call for the
  // same task must not reset a buffer that is already active.
  if (recordTo && ctx.taskId && ctx.userId && !recorder.recordTo(ctx.taskId)) {
    recorder.enable(ctx.taskId, recordTo);
    logger.debug(`[BrowseWeb] recorder enabled taskId=${ctx.taskId} recordTo=${recordTo}`);
  }

  // Action mode when actions were supplied, basic mode otherwise.
  return actions && actions.length > 0
    ? executeActions(actions, ctx, recordTo)
    : executeSimple(input, ctx);
}
/**
 * Basic (single-URL) mode of BrowseWeb.
 *
 * Validates the URL (workspace file-URL normalisation, then the SSRF check),
 * navigates the job page to it, optionally waits for `waitFor`, and checks for
 * an expired auth session. The output is either the text of `extractSelector`
 * or a ref-annotated snapshot of the whole page, passed through
 * saveBrowseText. An optional full-page screenshot is written under `output/`
 * and any downloads collected by drainDownloads are appended to the output.
 *
 * Recognised `input` keys: `url`, `waitFor`, `extractSelector`, `screenshot`,
 * `timeout` (number, milliseconds; defaults to the configured page timeout or
 * 60000).
 */
async function executeSimple(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const fail = (detail: string): ToolResult => ({ output: `BrowseWeb error: ${detail}`, isError: true });

  const url = input['url'] as string;
  const waitFor = input['waitFor'] as string | undefined;
  const extractSelector = input['extractSelector'] as string | undefined;
  const screenshotFile = input['screenshot'] as string | undefined;
  const requestedTimeout = input['timeout'];
  const pageTimeout = typeof requestedTimeout === 'number'
    ? requestedTimeout
    : (ctx.toolsConfig?.browserPageTimeout ?? 60000);
  const allowedHosts = ctx.toolsConfig?.webfetchAllowedHosts ?? [];

  // Pre-navigation SSRF check
  const normalizedUrl = normalizeFileUrlForWorkspace(url, ctx.workspacePath);
  if ('error' in normalizedUrl) return fail(normalizedUrl.error);
  const ssrfError = await ssrfCheck(normalizedUrl.url, allowedHosts, ctx.workspacePath);
  if (ssrfError) return fail(ssrfError);

  try {
    const page = await getJobPage(ctx, allowedHosts, pageTimeout);
    await page.goto(normalizedUrl.url, { waitUntil: 'load', timeout: pageTimeout });
    if (waitFor) {
      await page.waitForSelector(waitFor, { timeout: pageTimeout });
    }

    const expiredReason = await checkAuthExpiry(page, ctx);
    if (expiredReason) {
      return { output: `AUTH_SESSION_EXPIRED: ${expiredReason}`, isError: true };
    }

    // Output is assembled from parts separated by a blank line.
    const parts: string[] = [];

    if (!extractSelector) {
      // Ref-annotated snapshot with the interactive elements embedded.
      const snapshot = await snapshotPage(page);
      parts.push(await saveBrowseText(ctx, page.url(), snapshot, 'snapshot'));
    } else {
      const handle = await page.$(extractSelector);
      if (!handle) {
        parts.push(`(selector "${extractSelector}" not found on page)`);
      } else {
        const extracted = htmlToText(await handle.innerHTML());
        parts.push(await saveBrowseText(ctx, page.url(), extracted, `selector:${extractSelector}`));
      }
    }

    // Screenshot
    if (screenshotFile) {
      try {
        const savePath = resolveOutputPathWithin(ctx.workspacePath, path.join('output', screenshotFile), ['output']);
        const { mkdirSync } = await import('fs');
        mkdirSync(path.dirname(savePath), { recursive: true });
        await page.screenshot({ path: savePath, fullPage: true });
        parts.push(`[Screenshot saved to output/${screenshotFile}]`);
      } catch (e) {
        parts.push(`[Screenshot error: ${(e as Error).message}]`);
      }
    }

    // Report file downloads that happened while on the page (moved into output/).
    const dlSummary = formatDownloadLines(await drainDownloads(page));
    if (dlSummary) parts.push(dlSummary);

    return { output: parts.join('\n\n'), isError: false };
  } catch (e) {
    const msg = (e as Error).message ?? String(e);
    logger.warn(`[BrowseWeb] error: ${msg}`);
    return fail(msg);
  }
}
/**
 * Frame + selector a targeted action (click / fill / dumpHtml) operates on, or
 * `found: false` when the action's ref is not in the current ref table.
 */
type ActionTargetResolution =
  | { found: true; frame: Frame; selector: string | undefined }
  | { found: false };

/**
 * Resolve the target of a click / fill / dumpHtml action.
 * An explicit `selector` wins and is applied to the main frame; otherwise a
 * `ref` is looked up via resolveRef, which may point into a child frame.
 */
function resolveActionTarget(page: Page, action: BrowseWebAction): ActionTargetResolution {
  if (!action.selector && action.ref) {
    const target = resolveRef(page, action.ref);
    return target
      ? { found: true, frame: target.frame, selector: target.selector }
      : { found: false };
  }
  return { found: true, frame: page.mainFrame(), selector: action.selector };
}

/**
 * Compute what the recorder stores for an element action: the selector path
 * built by buildSelectorPath (falling back to the raw selector when that
 * fails, e.g. because the element detached) and the iframe chain of `frame`.
 */
async function resolveRecordedTarget(
  frame: Frame,
  selector: string,
): Promise<{ selector: string; frameChain: FrameChainEntry[] }> {
  let resolvedSelector = selector;
  try {
    resolvedSelector = await buildSelectorPath(frame.locator(selector).first());
  } catch {
    // element may have detached — fall back to the raw selector
  }
  const frameChain = await captureFrameChain(frame);
  return { selector: resolvedSelector, frameChain };
}

/**
 * Action mode of BrowseWeb: run `actions` in order against the job page.
 *
 * Each action appends one line to the result list. Validation problems
 * (missing url / selector, unknown ref, SSRF block, unknown action type) are
 * reported as a result line and execution continues with the next action.
 * Successful actions are handed to the recorder when recording is active for
 * the task. A goto that lands on an expired auth session returns immediately
 * with `AUTH_SESSION_EXPIRED`.
 *
 * If an action throws (for example a click timeout), the call returns an
 * error whose output is `BrowseWeb error: <message>` followed by the result
 * lines of the actions that had already completed, so the caller can tell how
 * far the sequence got.
 *
 * @param actions Actions to run, in order.
 * @param ctx Tool context (workspace, timeouts, task id, …).
 * @param recordTo Recording target; recording is skipped when absent.
 */
async function executeActions(
  actions: BrowseWebAction[],
  ctx: ToolContext,
  recordTo?: string,
): Promise<ToolResult> {
  const actionTimeout = ctx.toolsConfig?.browserActionTimeout ?? 30000;
  const allowedHosts = ctx.toolsConfig?.webfetchAllowedHosts ?? [];
  // Declared outside the try block so the error path can still report them.
  const results: string[] = [];

  // Helper to record a successful action. Errors in recording must never fail the BrowseWeb action.
  function tryRecord(entry: Omit<RecordedAction, 'ts'>): void {
    if (!ctx.taskId || !recordTo) return;
    if (!recorder.recordTo(ctx.taskId)) return;
    try {
      recorder.record(ctx.taskId, entry);
      logger.debug(`[BrowseWeb] recorded action type=${entry.type} taskId=${ctx.taskId}`);
    } catch (e) {
      logger.debug(`[BrowseWeb] recorder.record failed (ignored): ${(e as Error).message}`);
    }
  }

  try {
    const page = await getJobPage(ctx, allowedHosts, actionTimeout);

    for (const action of actions) {
      switch (action.type) {
        case 'goto': {
          const gotoUrl = action.url;
          if (!gotoUrl) {
            results.push('[goto] error: url is required');
            break;
          }
          const normalizedUrl = normalizeFileUrlForWorkspace(gotoUrl, ctx.workspacePath);
          if ('error' in normalizedUrl) {
            results.push(`[goto] error: ${normalizedUrl.error}`);
            break;
          }
          const ssrfError = await ssrfCheck(normalizedUrl.url, allowedHosts, ctx.workspacePath);
          if (ssrfError) {
            results.push(`[goto] SSRF blocked: ${ssrfError}`);
            break;
          }
          await page.goto(normalizedUrl.url, { waitUntil: 'load', timeout: actionTimeout });
          results.push(`[goto] navigated to ${gotoUrl}`);
          tryRecord({ type: 'goto', url: gotoUrl, frameChain: [] });
          const expiredReason = await checkAuthExpiry(page, ctx);
          if (expiredReason) {
            return {
              output: `AUTH_SESSION_EXPIRED: ${expiredReason}\n${results.join('\n')}`,
              isError: true,
            };
          }
          break;
        }

        case 'click': {
          // An explicit selector targets the main frame; a ref can resolve
          // into any frame.
          const target = resolveActionTarget(page, action);
          if (!target.found) {
            results.push(`[click] ref "${action.ref}" not found in current snapshot. Get a fresh snapshot with getText first.`);
            break;
          }
          const { frame, selector } = target;
          if (!selector) {
            results.push('[click] error: selector または ref が必要です');
            break;
          }
          await frame.click(selector, { timeout: actionTimeout });
          results.push(`[click] clicked ${action.ref ?? selector}`);
          // Building the selector path costs an evaluate round-trip, so it is
          // only done while recording is active for this task.
          if (ctx.taskId && recorder.recordTo(ctx.taskId)) {
            const recorded = await resolveRecordedTarget(frame, selector);
            tryRecord({
              type: 'click',
              selector: recorded.selector,
              originalRef: action.ref,
              frameChain: recorded.frameChain,
            });
          }
          break;
        }

        case 'fill': {
          const target = resolveActionTarget(page, action);
          if (!target.found) {
            results.push(`[fill] ref "${action.ref}" not found in current snapshot.`);
            break;
          }
          const { frame, selector } = target;
          if (!selector) {
            results.push('[fill] error: selector または ref が必要です');
            break;
          }
          await frame.fill(selector, action.value ?? '', { timeout: actionTimeout });
          results.push(`[fill] filled ${action.ref ?? selector}`);
          // Only resolve the DOM-based selector path when recording is active.
          if (ctx.taskId && recorder.recordTo(ctx.taskId)) {
            const recorded = await resolveRecordedTarget(frame, selector);
            tryRecord({
              type: 'fill',
              selector: recorded.selector,
              originalRef: action.ref,
              value: action.value,
              frameChain: recorded.frameChain,
            });
          }
          break;
        }

        case 'screenshot': {
          const filename = action.value ?? 'screenshot.png';
          let savePath: string;
          try {
            savePath = resolveOutputPathWithin(ctx.workspacePath, path.join('output', filename), ['output']);
          } catch (e) {
            results.push(`[screenshot] error: ${(e as Error).message}`);
            break;
          }
          const { mkdirSync } = await import('fs');
          mkdirSync(path.dirname(savePath), { recursive: true });
          await page.screenshot({ path: savePath, fullPage: true });
          results.push(`[screenshot] saved to output/${filename}`);
          tryRecord({ type: 'screenshot', value: filename, frameChain: [] });
          break;
        }

        case 'getText': {
          if (!action.selector) {
            // Ref-annotated snapshot of the whole page.
            const fullText = await snapshotPage(page);
            const text = await saveBrowseText(ctx, page.url(), fullText, 'snapshot');
            results.push(`[getText] ${text}`);
            tryRecord({ type: 'getText', frameChain: [] });
          } else {
            const el = await page.$(action.selector);
            if (el) {
              const fullText = await el.innerText();
              const text = await saveBrowseText(ctx, page.url(), fullText, `selector:${action.selector}`);
              results.push(`[getText] ${text}`);
              tryRecord({ type: 'getText', selector: action.selector, frameChain: [] });
            } else {
              results.push(`[getText] selector "${action.selector}" not found`);
            }
          }
          break;
        }

        case 'wait': {
          const ms = action.ms ?? 1000;
          const waitMs = Math.min(ms, 30000); // cap at 30s
          await page.waitForTimeout(waitMs);
          results.push(`[wait] waited ${waitMs}ms`);
          tryRecord({ type: 'wait', ms: waitMs, frameChain: [] });
          break;
        }

        case 'dumpHtml': {
          const target = resolveActionTarget(page, action);
          if (!target.found) {
            results.push(`[dumpHtml] ref "${action.ref}" not found in current snapshot.`);
            break;
          }
          // Unlike click / fill, a missing selector is valid here: it means the body.
          const { frame, selector } = target;
          const depth = Math.max(0, Math.min(action.depth ?? 3, 10));
          const fullHtml = await dumpElementHtml(frame, selector, depth);
          if (fullHtml === null) {
            const what = selector ? `selector "${selector}"` : 'document body';
            results.push(`[dumpHtml] ${what} not found`);
            break;
          }
          const previewLimit = Math.max(500, Math.min(action.maxChars ?? BROWSE_TEXT_PREVIEW_CHARS, 50_000));
          const text = await saveBrowseText(
            ctx,
            page.url(),
            fullHtml,
            `dumpHtml:${action.ref ?? selector ?? 'body'}`,
            previewLimit,
          );
          results.push(`[dumpHtml ${action.ref ?? selector ?? 'body'}] ${text}`);
          if (ctx.taskId && recorder.recordTo(ctx.taskId)) {
            const frameChain = await captureFrameChain(frame);
            tryRecord({ type: 'dumpHtml', selector: selector ?? undefined, originalRef: action.ref, frameChain });
          }
          break;
        }

        default:
          results.push(`[${action.type}] unknown action type`);
      }
    }

    // Report file downloads that happened during the actions (moved into output/).
    const downloads = await drainDownloads(page);
    const dlSummary = formatDownloadLines(downloads);
    if (dlSummary) results.push(dlSummary);

    return { output: results.join('\n'), isError: false };
  } catch (e) {
    const msg = (e as Error).message ?? String(e);
    logger.warn(`[BrowseWeb] action error: ${msg}`);
    // Keep the lines of the actions that already ran (same layout as the
    // AUTH_SESSION_EXPIRED result) instead of silently discarding them.
    const completed = results.length > 0 ? `\n${results.join('\n')}` : '';
    return { output: `BrowseWeb error: ${msg}${completed}`, isError: true };
  }
}
// --- InteractiveBrowse / BrowseWithSession tool definitions ---
/**
 * Tool schema for `InteractiveBrowse`, exposed to the model as a function tool.
 *
 * The (Japanese) description tells the model that this tool hands browser
 * operation over to the user through noVNC at the given URL, that the job
 * moves to `waiting_human` until the user releases it, and that the agent can
 * then continue with `BrowseWithSession`. It lists the intended use cases:
 * password / 2FA / SSO logins, CAPTCHA or bot checks, complex UIs where
 * BrowseWeb clicks keep failing, and visual confirmation of the page state.
 *
 * Parameters: `url` and `reason` are required; `sessionId` is optional and
 * selects an existing session instead of creating a new one.
 */
const INTERACTIVEBROWSE_DEF: ToolDef = {
type: 'function',
function: {
name: 'InteractiveBrowse',
description:
'ブラウザ操作をユーザーに委譲する。指定URLでブラウザ画面を起動し、noVNC経由でユーザーがその画面を直接見て手動操作できる。完了するとユーザーが release ボタンを押し、ジョブが再開して agent が BrowseWithSession で続きを引き継げるCookie / DOM 状態保持)。\n' +
'使うべき場面: (1) ログイン/2FA/SSO 同意などパスワード入力を伴う認証、(2) CAPTCHA や bot 検証、(3) BrowseWeb の click が空振りし続ける複雑な UIドラッグ&ドロップ、canvas/WebGL、closed shadow DOM、(4) 画面状態を目視確認したいとき。\n' +
'呼び出すとジョブは waiting_human に遷移し、ユーザー操作完了まで停止する。失敗 (noVNC 未構成) の場合はエラーが返るので BrowseWeb で続けるか諦めること。詳細は ReadToolDoc({ name: "InteractiveBrowse" }) で取得可能。',
parameters: {
type: 'object',
properties: {
url: { type: 'string', description: '開くURL' },
reason: { type: 'string', description: 'ユーザーに操作を依頼する理由(例: ログインが必要です)' },
sessionId: { type: 'string', description: '既存セッションID省略時は新規作成' },
},
required: ['url', 'reason'],
},
},
};
/**
 * Tool schema for `BrowseWithSession`, exposed to the model as a function tool.
 *
 * The (Japanese) description tells the model that this tool continues agent
 * operation in the session the user just drove through InteractiveBrowse,
 * with cookies, login state and DOM carried over, and that the `sessionId`
 * comes from InteractiveBrowse's return value.
 *
 * Parameters: `sessionId` and `url` are required. `action` is one of
 * `getText` / `screenshot` / `click` / `fill` (described as defaulting to
 * getText when omitted); `selector` is used by click / fill / getText and
 * `value` by fill.
 */
const BROWSEWITHSESSION_DEF: ToolDef = {
type: 'function',
function: {
name: 'BrowseWithSession',
description:
'InteractiveBrowse でユーザーが手動操作した直後のセッションを使って agent が後続操作を続けるためのツール。Cookie・ログイン状態・DOM がそのまま引き継がれる。\n' +
'sessionId は InteractiveBrowse の戻り値から取得する。actionsgetText/screenshot/click/fillと selector/value で操作できる。詳細は ReadToolDoc({ name: "BrowseWithSession" }) で取得可能。',
parameters: {
type: 'object',
properties: {
sessionId: { type: 'string', description: 'セッションID' },
url: { type: 'string', description: '移動先URL' },
action: {
type: 'string',
// Must stay in sync with the action cases handled by executeBrowseWithSession.
enum: ['getText', 'screenshot', 'click', 'fill'],
description: '実行するアクション省略時はgetText',
},
selector: { type: 'string', description: 'CSSセレクタclick/fill/getText で使用)' },
value: { type: 'string', description: '入力値fill で使用)' },
},
required: ['sessionId', 'url'],
},
},
};
// --- InteractiveBrowse implementation ---
/**
 * InteractiveBrowse: open `url` in a browser session and hand control to the
 * user.
 *
 * Uses the session named by `sessionId` when given, otherwise gets or creates
 * the session for the current task (which requires `ctx.taskId`). The
 * session's first page (or a new one) is navigated to `url` when the session
 * has a browser context, the session is marked `user_interactive`, and the
 * result is a JSON string carrying `action: 'waiting_human'`, the session id,
 * the noVNC path and the reason.
 *
 * NOTE(review): no SSRF / allowed-host check is visible in this function
 * before `page.goto(url)`, unlike the BrowseWeb paths — confirm whether the
 * session's context enforces that elsewhere.
 */
async function executeInteractiveBrowse(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const url = input['url'] as string;
  const reason = input['reason'] as string;
  const sessionId = input['sessionId'] as string | undefined;
  const navigationTimeout = ctx.toolsConfig?.browserPageTimeout ?? 60000;

  const sm = getSessionManager();
  if (!sm) {
    return {
      output: 'InteractiveBrowse error: ブラウザセッション機能が利用できませんXvfb, x11vnc, websockify が必要です)',
      isError: true,
    };
  }

  try {
    let session: BrowserSession;
    if (!sessionId) {
      // Creating a session needs a task id. Per the original note,
      // ToolContext.taskId is only populated for jobs that come through a
      // local task; without it visibility-based authorization cannot be
      // applied, so the request is rejected explicitly.
      if (!ctx.taskId) {
        return {
          output: 'InteractiveBrowse error: このジョブには taskId が紐付いていないため、新規セッションを作れません (ローカルタスク経由で実行してください)',
          isError: true,
        };
      }
      session = await sm.getOrCreateTaskSession(ctx.taskId, ctx.userId);
    } else {
      const found = sm.getSession(sessionId);
      if (!found) {
        return { output: `InteractiveBrowse error: セッション ${sessionId} が見つかりません`, isError: true };
      }
      session = found;
    }
    sm.touchSession(session.id);

    // Navigate to URL, reusing the session's first page when there is one.
    const browserContext = session.context;
    if (browserContext) {
      const [firstPage] = browserContext.pages();
      const page = firstPage ?? (await browserContext.newPage());
      await page.goto(url, { waitUntil: 'load', timeout: navigationTimeout });
    }

    session.state = 'user_interactive';

    const payload = JSON.stringify({
      action: 'waiting_human',
      waitReason: 'browser_login',
      sessionId: session.id,
      novncPath: buildNovncPath(session.id),
      reason,
    });
    return { output: payload, isError: false };
  } catch (e) {
    const msg = (e as Error).message ?? String(e);
    logger.warn(`[InteractiveBrowse] error: ${msg}`);
    return { output: `InteractiveBrowse error: ${msg}`, isError: true };
  }
}
// --- BrowseWithSession implementation ---
/**
 * BrowseWithSession: run one agent action inside an existing browser session.
 *
 * Looks up the session, navigates its first page (or a new one) to `url`, and
 * then performs `action` (default `getText`):
 *  - `getText`: text of `selector`, or of the whole page when no selector is
 *    given, passed through saveBrowseText;
 *  - `screenshot`: full-page PNG saved as `output/screenshot-<timestamp>.png`;
 *  - `click`: clicks `selector` and reports any downloads drained afterwards;
 *  - `fill`: fills `selector` with `value` (empty string when omitted).
 * Navigation always happens before the action is validated, so a click / fill
 * without a selector still loads `url` first.
 *
 * NOTE(review): no SSRF / allowed-host check is visible in this function
 * before `page.goto(url)`, unlike the BrowseWeb paths — confirm whether the
 * session's context enforces that elsewhere.
 */
async function executeBrowseWithSession(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const sessionId = input['sessionId'] as string;
  const url = input['url'] as string;
  const selector = input['selector'] as string | undefined;
  const value = input['value'] as string | undefined;
  const requestedAction = input['action'] as string | undefined;
  const action = requestedAction ?? 'getText';

  const sm = getSessionManager();
  if (!sm) {
    return {
      output: 'BrowseWithSession error: ブラウザセッション機能が利用できません',
      isError: true,
    };
  }
  const session = sm.getSession(sessionId);
  if (!session) {
    return { output: `BrowseWithSession error: セッション ${sessionId} が見つかりません`, isError: true };
  }
  const browserContext = session.context;
  if (!browserContext) {
    return { output: 'BrowseWithSession error: セッションのコンテキストが無効です', isError: true };
  }
  sm.touchSession(session.id);

  try {
    const [firstPage] = browserContext.pages();
    const page = firstPage ?? (await browserContext.newPage());
    const pageTimeout = ctx.toolsConfig?.browserPageTimeout ?? 60000;
    const actionTimeout = ctx.toolsConfig?.browserActionTimeout ?? 30000;

    setupDownloadHandler(page, ctx.workspacePath);
    await page.goto(url, { waitUntil: 'load', timeout: pageTimeout });

    if (action === 'getText') {
      if (!selector) {
        // No selector: convert the whole document to text.
        const pageText = htmlToText(await page.content());
        return { output: await saveBrowseText(ctx, page.url(), pageText, 'page'), isError: false };
      }
      const handle = await page.$(selector);
      if (!handle) {
        return { output: `(selector "${selector}" not found)`, isError: false };
      }
      const elementText = await handle.innerText();
      return {
        output: await saveBrowseText(ctx, page.url(), elementText, `selector:${selector}`),
        isError: false,
      };
    }

    if (action === 'screenshot') {
      const filename = `screenshot-${Date.now()}.png`;
      const savePath = path.join(ctx.workspacePath, 'output', filename);
      const { mkdirSync } = await import('fs');
      mkdirSync(path.dirname(savePath), { recursive: true });
      await page.screenshot({ path: savePath, fullPage: true });
      return { output: `Screenshot saved to output/${filename}`, isError: false };
    }

    if (action === 'click') {
      if (!selector) {
        return { output: 'BrowseWithSession error: click には selector が必要です', isError: true };
      }
      await page.click(selector, { timeout: actionTimeout });
      // A click may have triggered a download; include it in the report.
      const dlSummary = formatDownloadLines(await drainDownloads(page));
      return {
        output: dlSummary ? `Clicked ${selector}\n${dlSummary}` : `Clicked ${selector}`,
        isError: false,
      };
    }

    if (action === 'fill') {
      if (!selector) {
        return { output: 'BrowseWithSession error: fill には selector が必要です', isError: true };
      }
      await page.fill(selector, value ?? '', { timeout: actionTimeout });
      return { output: `Filled ${selector}`, isError: false };
    }

    return { output: `BrowseWithSession error: unknown action "${action}"`, isError: true };
  } catch (e) {
    const msg = (e as Error).message ?? String(e);
    logger.warn(`[BrowseWithSession] error: ${msg}`);
    return { output: `BrowseWithSession error: ${msg}`, isError: true };
  }
}
// --- buildSelectorPath ---
//
// Given a Playwright Locator, evaluates in the browser to build a CSS selector
// path for the element using a priority list:
// data-testid > stable #id > tag[name="..."] > tag (+ [type] for inputs, + :nth-of-type).
// The path climbs at most 4 ancestors and stops early at the first ancestor
// that has a data-testid, a stable id or a name.
// Used by the recorder to store resolved selectors (not LLM refs) in the action buffer.

/**
 * @param locator Locator for the element to describe; the callback runs in the page.
 * @returns The CSS selector path for the element.
 */
async function buildSelectorPath(locator: Locator): Promise<string> {
  return await locator.evaluate((el: Element) => {
    function tryStrictAttr(node: Element, attr: string): string | null {
      // Strict: only simple alphanumeric/underscore/hyphen values (e.g. data-testid)
      const v = node.getAttribute(attr);
      return v && /^[a-zA-Z0-9_-]+$/.test(v) ? v : null;
    }

    function tryNameAttr(node: Element): string | null {
      // More permissive: allow dots, slashes, colons and brackets, which are
      // common in form field names (e.g. user.email, items[0], user:email).
      // These are valid inside [name="..."].
      const v = node.getAttribute('name');
      return v && /^[a-zA-Z0-9_./:[\]-]+$/.test(v) ? v : null;
    }

    function isStableId(id: string): boolean {
      // Accept only ids made of letters, digits, `_` and `-` that start with a
      // letter or underscore and are shorter than 60 characters. This filters
      // out generated ids such as `:r1:` and keeps the id usable as `#id`
      // without escaping.
      return /^[a-zA-Z_][a-zA-Z0-9_-]*$/.test(id) && id.length < 60;
    }

    function nthOfType(node: Element): number {
      const siblings = Array.from(node.parentElement?.children ?? []).filter(
        s => s.tagName === node.tagName,
      );
      return siblings.indexOf(node) + 1;
    }

    function pathFor(node: Element, depth: number): string {
      const testId = tryStrictAttr(node, 'data-testid');
      if (testId) return `[data-testid="${testId}"]`;
      if (node.id && isStableId(node.id)) return `#${node.id}`;
      const name = tryNameAttr(node);
      const tag = node.tagName.toLowerCase();
      if (name) return `${tag}[name="${name}"]`;

      // For input elements, include type to be more specific (e.g. input[type="email"]).
      // Read the ATTRIBUTE, not the `.type` property: the property reports
      // "text" even when the attribute is missing or holds an unknown value,
      // and `[type="text"]` would then not match the element at all.
      const typeAttr = node.tagName === 'INPUT' ? tryStrictAttr(node, 'type') : null;
      const typeFragment = typeAttr ? `[type="${typeAttr}"]` : '';

      const nth = nthOfType(node);
      const indexFragment = nth > 1 ? `:nth-of-type(${nth})` : '';
      const local = `${tag}${typeFragment}${indexFragment}`;
      if (depth === 0 || !node.parentElement) return local;
      return `${pathFor(node.parentElement, depth - 1)} > ${local}`;
    }

    return pathFor(el, 4);
  });
}
// --- Exports ---
// BrowseWeb is always offered; the two session tools are only registered when
// SessionManager reports that browser sessions are available.
const baseDefs: Record<string, ToolDef> = { BrowseWeb: BROWSEWEB_DEF };
if (SessionManager.isAvailable()) {
  Object.assign(baseDefs, {
    InteractiveBrowse: INTERACTIVEBROWSE_DEF,
    BrowseWithSession: BROWSEWITHSESSION_DEF,
  });
}

/** Tool definitions exported by this module, keyed by tool name. */
export const TOOL_DEFS: Record<string, ToolDef> = baseDefs;
/**
 * Dispatch a tool call to this module's implementations.
 *
 * @param name Tool name.
 * @param input Raw tool arguments.
 * @param ctx Tool context passed through to the implementation.
 * @returns The tool's result, or `null` when `name` is not one of
 *   `BrowseWeb`, `InteractiveBrowse` or `BrowseWithSession`.
 */
export async function executeTool(
  name: string,
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult | null> {
  if (name === 'BrowseWeb') return executeBrowseWeb(input, ctx);
  if (name === 'InteractiveBrowse') return executeInteractiveBrowse(input, ctx);
  if (name === 'BrowseWithSession') return executeBrowseWithSession(input, ctx);
  return null;
}