import { ToolDef } from '../../llm/openai-compat.js';
import type { ToolContext, ToolResult } from './core.js';
import { resolveOutputPathWithin } from './core.js';
import { logger } from '../../logger.js';
import type { SearchFilterConfig } from '../../config.js';
import { checkSSRF, ssrfSafeFetch } from './shared/ssrf.js';
import { htmlToText } from './shared/html.js';
import * as fs from 'fs';
import * as path from 'path';
import * as crypto from 'crypto';
import {
  looksLikeBinaryBytes,
  decodeText,
  SNIFF_HEAD_BYTES,
  type BinaryVerdict,
} from './binary-detect.js';

/** Content-type prefixes that WebFetch refuses to expand as text. */
const BINARY_CONTENT_TYPE_PREFIXES = [
  'application/pdf',
  'application/octet-stream',
  'application/zip',
  'application/x-zip-compressed',
  'application/vnd.',
  'image/',
  'audio/',
  'video/',
];

// WebFetch text bodies are cut off at 5MB so a huge HTML page cannot bloat the context.
const MAX_WEBFETCH_BODY_BYTES = 5 * 1024 * 1024;

// --- Tool definitions ---

const WEBSEARCH_DEF: ToolDef = {
  type: 'function',
  function: {
    name: 'WebSearch',
    description:
      'Google 検索でインターネットを検索する。検索クエリにプライベートIP、内部ドメイン、メールアドレス、電話番号等の機密情報を含めないでください。詳細は ReadToolDoc({ name: "WebSearch" })。',
    parameters: {
      type: 'object',
      properties: {
        query: { type: 'string', description: '検索クエリ' },
        limit: { type: 'number', description: '結果件数 (デフォルト: 5, 最大: 20)' },
      },
      required: ['query'],
    },
  },
};

const WEBFETCH_DEF: ToolDef = {
  type: 'function',
  function: {
    name: 'WebFetch',
    description:
      'URL からページのテキスト内容を取得する(静的ページ向け、軽量・高速)。動的レンダリングや操作が必要なら BrowseWeb。詳細は ReadToolDoc({ name: "WebFetch" })。',
    parameters: {
      type: 'object',
      properties: {
        url: { type: 'string', description: '取得する URL' },
        selector: { type: 'string', description: 'CSS セレクタで特定要素だけ取得 (省略時はページ全体)' },
      },
      required: ['url'],
    },
  },
};

const DOWNLOADFILE_DEF: ToolDef = {
  type: 'function',
  function: {
    name: 'DownloadFile',
    description:
      'URL からファイル(PDF・画像・バイナリ等)をダウンロードし、workspace の input/ または output/ に保存する。WebFetch では取得できない PDF や画像ファイルに使う。詳細・命名規約は ReadToolDoc({ name: "DownloadFile" })。',
    parameters: {
      type: 'object',
      properties: {
        url: { type: 'string', description: 'ダウンロードする URL' },
        filename: { type: 'string', description: '保存するファイル名(省略時は URL から推定)' },
        section: { type: 'string', description: '保存先: "input" または "output" (デフォルト: "input")' },
      },
      required: ['url'],
    },
  },
};

// --- Search query sanitisation ---

// Private IPv4 addresses (10.*, 172.16-31.*, 192.168.*, 127.*).
const PRIVATE_IP_PATTERN = /\b(?:10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(?:1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}|192\.168\.\d{1,3}\.\d{1,3}|127\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/g;
// E-mail addresses.
const EMAIL_PATTERN = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
// Japanese phone numbers (090-1234-5678, 03-1234-5678, 0120-123-456, ...).
const PHONE_PATTERN = /\b0\d{1,4}[-‐ー]\d{1,4}[-‐ー]\d{1,4}\b/g;
// Internal host names (.local, .internal, .lan, .intranet, .corp, .home).
const INTERNAL_DOMAIN_PATTERN = /\b[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?\.(?:local|internal|lan|intranet|corp|home)\b/g;

/** Extracts a printable message from an arbitrary thrown value. */
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Removes sensitive fragments from a search query.
 *
 * Each built-in filter (private IP, e-mail, phone, internal domain) is applied
 * unless its `config.autoBlock` flag is explicitly `false`; afterwards every
 * literal string in `config.blockedPatterns` is removed and whitespace is
 * collapsed.
 *
 * @param query  Raw query text.
 * @param config Filter configuration.
 * @returns The sanitised query, or `null` when nothing is left.
 */
export function sanitizeQuery(query: string, config: SearchFilterConfig): string | null {
  const autoBlock = {
    privateIp: config.autoBlock?.privateIp !== false,
    internalDomain: config.autoBlock?.internalDomain !== false,
    email: config.autoBlock?.email !== false,
    phone: config.autoBlock?.phone !== false,
  };

  let result = query;

  // Built-in patterns.
  if (autoBlock.privateIp) {
    result = result.replace(PRIVATE_IP_PATTERN, '');
  }
  if (autoBlock.email) {
    result = result.replace(EMAIL_PATTERN, '');
  }
  if (autoBlock.phone) {
    result = result.replace(PHONE_PATTERN, '');
  }
  if (autoBlock.internalDomain) {
    result = result.replace(INTERNAL_DOMAIN_PATTERN, '');
  }

  // Custom patterns are treated as literal substrings (split/join, not regex).
  if (config.blockedPatterns) {
    for (const pattern of config.blockedPatterns) {
      result = result.split(pattern).join('');
    }
  }

  // Normalise whitespace.
  result = result.replace(/\s+/g, ' ').trim();
  return result.length > 0 ? result : null;
}

/**
 * Extracts URL + title + snippet triples from the `innerText` of a search
 * result page.
 *
 * A line consisting solely of an http(s) URL is taken as a result; the line
 * before it becomes the title and the line after it the snippet.
 *
 * @param text  Page text.
 * @param limit Maximum number of results; values `<= 0` yield an empty array.
 */
export function parseSearchResultsFromText(
  text: string,
  limit: number,
): Array<{ title: string; url: string; snippet: string }> {
  const results: Array<{ title: string; url: string; snippet: string }> = [];
  if (limit <= 0) return results;

  const lines = text.split('\n');
  const urlPattern = /^(https?:\/\/[^\s]+)$/;

  for (let i = 0; i < lines.length; i++) {
    const match = lines[i]!.trim().match(urlPattern);
    if (!match) continue;
    const url = match[1]!;
    // Skip Google's own navigation links.
    if (url.includes('google.com/search') || url.includes('google.com/imgres')) continue;

    const title = (i > 0 ? lines[i - 1]!.trim() : '') || '(タイトルなし)';
    const snippet = (i + 1 < lines.length ? lines[i + 1]!.trim() : '') || '';
    results.push({ title, url, snippet });
    if (results.length >= limit) break;
  }
  return results;
}

/**
 * Decides from the Content-Type header alone whether a response is binary.
 *
 * Text-like types are recognised by their subtype *suffix* (`json`, `+xml`,
 * `x-javascript`, ...) rather than by substring: a substring test for "xml"
 * also matches `application/vnd.openxmlformats-officedocument.*` (docx/xlsx),
 * which would let Office documents slip past the `application/vnd.` prefix.
 */
function isLikelyBinaryResponse(contentType: string): boolean {
  const normalized = contentType.toLowerCase();
  const semi = normalized.indexOf(';');
  // Drop parameters such as "; charset=utf-8".
  const mime = (semi === -1 ? normalized : normalized.slice(0, semi)).trim();
  if (!mime) return false;
  if (mime.startsWith('text/')) return false;

  const subtype = mime.slice(mime.indexOf('/') + 1);
  if (/(?:^|[+.-])(?:json|xml|javascript)$/.test(subtype) || subtype.startsWith('svg')) {
    return false;
  }
  return BINARY_CONTENT_TYPE_PREFIXES.some((prefix) => mime.startsWith(prefix));
}

/** True when the Content-Type is PDF or the URL path ends with `.pdf`. */
function isPdfResponse(rawUrl: string, contentType: string): boolean {
  const normalized = contentType.toLowerCase();
  if (normalized.startsWith('application/pdf')) return true;
  try {
    return new URL(rawUrl).pathname.toLowerCase().endsWith('.pdf');
  } catch {
    return false;
  }
}

type WebFetchHistoryRecord = {
  timestamp: string;
  url: string;
  selector?: string;
  status: number | null;
  contentType: string | null;
  outcome: 'success' | 'invalid_url' | 'ssrf_blocked' | 'http_error' | 'pdf_blocked' | 'binary_blocked' | 'error';
  error?: string;
  screenshotPath?: string;
};

type DownloadHistoryRecord = {
  timestamp: string;
  url: string;
  savedPath: string | null;
  status: number | null;
  contentType: string | null;
  bytes: number | null;
  outcome: 'success' | 'invalid_url' | 'ssrf_blocked' | 'http_error' | 'error';
  error?: string;
};

type WebSearchHistoryRecord = {
  timestamp: string;
  query: string;
  limit: number;
  method: string;
  resultCount: number;
  outcome: 'success' | 'captcha' | 'timeout' | 'error';
  fallback: boolean;
  error?: string;
};

/**
 * Appends one JSON line to `<workspace>/logs/<fileName>`.
 * Best-effort: failures are logged with `tag` and never thrown.
 */
function appendJsonlHistory(ctx: ToolContext, fileName: string, tag: string, record: unknown): void {
  try {
    const logsDir = path.join(ctx.workspacePath, 'logs');
    const historyPath = path.join(logsDir, fileName);
    fs.mkdirSync(logsDir, { recursive: true });
    fs.appendFileSync(historyPath, `${JSON.stringify(record)}\n`, 'utf-8');
  } catch (e) {
    logger.warn(`[${tag}] failed to write history: ${errorMessage(e)}`);
  }
}

/** Appends a WebSearch attempt to `logs/websearch-history.jsonl` (best-effort). */
export function appendWebSearchHistory(ctx: ToolContext, record: WebSearchHistoryRecord): void {
  appendJsonlHistory(ctx, 'websearch-history.jsonl', 'WebSearch', record);
}

const DEFAULT_USER_AGENT =
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';

export interface SearchResult {
  title: string;
  url: string;
  snippet: string;
}

/**
 * Searches through the SearXNG JSON API (used as the final fallback).
 *
 * @throws When the request fails, times out, or returns a non-2xx status.
 */
export async function searchViaSearxng(
  query: string,
  limit: number,
  ctx: ToolContext,
): Promise<SearchResult[]> {
  // Tolerate a trailing slash in the configured base URL.
  const searxngUrl = (ctx.toolsConfig?.searxngUrl ?? 'http://searxng:8080').replace(/\/+$/, '');
  const timeoutSec = ctx.toolsConfig?.websearchTimeout ?? 15;

  const url = `${searxngUrl}/search?q=${encodeURIComponent(query)}&format=json&language=ja`;
  logger.debug(`[WebSearch] SearXNG fallback: url=${url}`);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutSec * 1000);
  try {
    const response = await fetch(url, { signal: controller.signal });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} ${response.statusText}`);
    }
    // The timer stays armed while the body is read so a stalled body also times out.
    const json = (await response.json()) as {
      results?: Array<{ title?: string; url?: string; content?: string }>;
    };
    const rows = Array.isArray(json.results) ? json.results : [];
    return rows.slice(0, limit).map((r) => ({
      title: r.title ?? '(タイトルなし)',
      url: r.url ?? '',
      snippet: r.content ?? '',
    }));
  } finally {
    clearTimeout(timer);
  }
}

/** Appends a DownloadFile attempt to `logs/downloads.jsonl` (best-effort). */
function appendDownloadHistory(ctx: ToolContext, record: DownloadHistoryRecord): void {
  appendJsonlHistory(ctx, 'downloads.jsonl', 'DownloadFile', record);
}

/** Appends a WebFetch attempt to `logs/webfetch-history.jsonl` (best-effort). */
function appendWebFetchHistory(ctx: ToolContext, record: WebFetchHistoryRecord): void {
  appendJsonlHistory(ctx, 'webfetch-history.jsonl', 'WebFetch', record);
}

/** Cancels an unread response body so the underlying connection is released. */
async function discardBody(response: { body: ReadableStream<Uint8Array> | null }): Promise<void> {
  try {
    await response.body?.cancel();
  } catch {
    /* best-effort */
  }
}

/**
 * After a successful WebFetch, takes a first-view screenshot with Playwright,
 * stores it under `logs/webfetch-screenshots/` and returns it as a data URL.
 *
 * - Skipped (returns `null`) when `ctx.vlmEnabled` is falsy or
 *   `toolsConfig.webfetchScreenshot` is `false`.
 * - Any failure (Playwright unavailable, navigation error, ...) is only
 *   logged and yields `null`; the WebFetch itself still counts as successful.
 */
async function captureWebFetchScreenshot(
  ctx: ToolContext,
  url: string,
): Promise<{ dataUrl: string; savedPath: string } | null> {
  if (!ctx.vlmEnabled) return null;
  if (ctx.toolsConfig?.webfetchScreenshot === false) return null;

  const timeoutMs = ctx.toolsConfig?.webfetchScreenshotTimeoutMs ?? 15_000;
  let context: import('playwright').BrowserContext | null = null;
  let page: import('playwright').Page | null = null;
  try {
    // The preview screenshot may be shared across tasks, so it borrows the
    // CAPTCHA-pool browser instead of a per-task BrowseWeb session (which
    // must stay isolated).
    const { getCaptchaPoolBrowser } = await import('./browser.js');
    const browser = await getCaptchaPoolBrowser();
    context = await browser.newContext({
      viewport: { width: 1280, height: 1600 },
      userAgent:
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    page = await context.newPage();
    await page.goto(url, { waitUntil: 'load', timeout: timeoutMs });
    // Give the network up to 3 more seconds to settle; carry on regardless.
    await page.waitForLoadState('networkidle', { timeout: 3_000 }).catch(() => undefined);

    const logsDir = path.join(ctx.workspacePath, 'logs', 'webfetch-screenshots');
    fs.mkdirSync(logsDir, { recursive: true });
    const hash = crypto.createHash('sha1').update(url).digest('hex').slice(0, 10);
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    const filename = `${timestamp}-${hash}.png`;
    const savedPath = path.join(logsDir, filename);

    const buffer = await page.screenshot({ fullPage: false, type: 'png' });
    fs.writeFileSync(savedPath, buffer);

    const dataUrl = `data:image/png;base64,${buffer.toString('base64')}`;
    logger.debug(`[WebFetch] screenshot saved: ${path.relative(ctx.workspacePath, savedPath)} (${buffer.byteLength} bytes)`);
    return { dataUrl, savedPath };
  } catch (e) {
    const msg = errorMessage(e);
    logger.warn(`[WebFetch] screenshot failed for ${url}: ${msg}`);
    return null;
  } finally {
    if (page) await page.close().catch(() => undefined);
    if (context) await context.close().catch(() => undefined);
  }
}

// --- WebSearch implementation (browser engine chain + SearXNG fallback) ---

/** CAPTCHA / bot-detection phrases (Japanese and English). */
const CAPTCHA_PATTERN = /unusual traffic|CAPTCHA|captcha|通常と異なるトラフィック|ロボットではない|not a robot|verify you are human|Drag the slider/i;

/** Definition of one browser-driven search engine. */
interface SearchEngineConfig {
  name: string;
  buildUrl: (query: string, limit: number) => string;
  /** Source of an in-page script (with `limit` baked in) that returns the parsed results. */
  domScript: (limit: number) => string;
  /** URL fragments identifying engine-internal links that should be skipped. */
  internalUrlPatterns?: string[];
}

// NOTE: the domScript bodies are evaluated inside the page, so they are kept
// as ES5-style source text; `//` comments in them rely on real line breaks.
const SEARCH_ENGINES: SearchEngineConfig[] = [
  {
    name: 'Google',
    buildUrl: (q, limit) => `https://www.google.com/search?q=${encodeURIComponent(q)}&hl=ja&num=${limit}`,
    domScript: (limit) => `
      (function(maxResults) {
        var items = [];
        var seen = {};
        // 2025-2026: Google は .g 以外に .MjjYud / .Gx5Zad / [data-sokoban-container] [jscontroller] も使う
        var els = document.querySelectorAll('div.MjjYud, div.g, div.Gx5Zad, [data-sokoban-container] [jscontroller]');
        for (var i = 0; i < els.length; i++) {
          if (items.length >= maxResults) break;
          var el = els[i];
          var titleEl = el.querySelector('h3');
          if (!titleEl) continue;
          // h3 を内包する <a> が本当の結果リンク。最初の <a> だと「このページについて」等を拾ってしまう
          var linkEl = titleEl.closest('a[href]');
          if (!linkEl) continue;
          var href = linkEl.getAttribute('href') || '';
          if (!/^https?:\\/\\//.test(href)) continue;
          if (href.indexOf('google.com/search') !== -1 || href.indexOf('google.com/imgres') !== -1) continue;
          if (seen[href]) continue;
          seen[href] = true;
          var snippetEl = el.querySelector('[data-sncf], .VwiC3b, [style*="-webkit-line-clamp"]');
          items.push({
            title: (titleEl.textContent || '').trim(),
            url: href,
            snippet: snippetEl ? (snippetEl.textContent || '').trim() : ''
          });
        }
        return items;
      })(${limit})`,
  },
  {
    name: 'Brave',
    buildUrl: (q, limit) => `https://search.brave.com/search?q=${encodeURIComponent(q)}&source=web`,
    domScript: (limit) => `
      (function(maxResults) {
        var items = [];
        var els = document.querySelectorAll('.snippet');
        for (var i = 0; i < els.length; i++) {
          if (items.length >= maxResults) break;
          var el = els[i];
          var linkEl = el.querySelector('.result-header, a.heading[href]');
          if (!linkEl) linkEl = el.querySelector('a[href]');
          var titleEl = el.querySelector('.snippet-title');
          if (!titleEl) titleEl = el.querySelector('h2, h3');
          var snippetEl = el.querySelector('.snippet-description');
          if (!snippetEl) snippetEl = el.querySelector('.snippet-content p, p');
          if (linkEl) {
            var href = linkEl.getAttribute('href') || '';
            if (href.indexOf('/search') === 0 || href.indexOf('brave.com') !== -1) continue;
            var title = titleEl ? (titleEl.textContent || '').trim() : (linkEl.textContent || '').trim();
            if (!title) continue;
            items.push({
              title: title,
              url: href,
              snippet: snippetEl ? (snippetEl.textContent || '').trim() : ''
            });
          }
        }
        return items;
      })(${limit})`,
  },
  {
    name: 'Yahoo',
    buildUrl: (q, limit) => `https://search.yahoo.co.jp/search?p=${encodeURIComponent(q)}&n=${limit}`,
    domScript: (limit) => `
      (function(maxResults) {
        var items = [];
        var els = document.querySelectorAll('#web .algo, #WS2m .sw-Card');
        if (els.length === 0) els = document.querySelectorAll('.Sr');
        for (var i = 0; i < els.length; i++) {
          if (items.length >= maxResults) break;
          var el = els[i];
          var linkEl = el.querySelector('a[href]');
          var titleEl = el.querySelector('h3, .sw-Card__title');
          var snippetEl = el.querySelector('.sw-Card__description, p');
          if (linkEl) {
            var href = linkEl.getAttribute('href') || '';
            var title = titleEl ? (titleEl.textContent || '').trim() : (linkEl.textContent || '').trim();
            if (!title) continue;
            items.push({
              title: title,
              url: href,
              snippet: snippetEl ? (snippetEl.textContent || '').trim() : ''
            });
          }
        }
        return items;
      })(${limit})`,
  },
];

// --- Persistent BrowserContext management (for solving CAPTCHAs via noVNC) ---

/** One long-lived context per engine name, so solved-CAPTCHA cookies survive between searches. */
const persistentContexts = new Map<string, import('playwright').BrowserContext>();
/** Pages left open on a CAPTCHA so that a human can solve them; oldest first. */
const captchaPages: Array<{ engineName: string; page: import('playwright').Page; createdAt: number }> = [];

/**
 * Returns the persistent context for `engineName`, creating a new one when
 * none exists or the cached one no longer belongs to `browser`.
 */
async function getOrCreateContext(
  engineName: string,
  browser: import('playwright').Browser,
): Promise<import('playwright').BrowserContext> {
  const existing = persistentContexts.get(engineName);
  if (existing) {
    // The cached context is still usable only if the browser still lists it.
    try {
      if (browser.contexts().includes(existing)) {
        return existing;
      }
    } catch {
      // ignore — treated as stale below
    }
    persistentContexts.delete(engineName);
  }

  const ctx = await browser.newContext({
    userAgent: DEFAULT_USER_AGENT,
    locale: 'ja-JP',
  });
  persistentContexts.set(engineName, ctx);
  return ctx;
}

/** Remembers a CAPTCHA page and closes the oldest ones beyond `maxPages`. */
function trackCaptchaPage(
  engineName: string,
  page: import('playwright').Page,
  maxPages: number,
): void {
  captchaPages.push({ engineName, page, createdAt: Date.now() });
  while (captchaPages.length > maxPages) {
    const old = captchaPages.shift();
    old?.page.close().catch(() => {});
  }
}

/** Closes and forgets every tracked CAPTCHA page and persistent context. */
export function clearPersistentContexts(): void {
  for (const { page } of captchaPages) {
    page.close().catch(() => {});
  }
  captchaPages.length = 0;

  for (const ctx of persistentContexts.values()) {
    ctx.close().catch(() => {});
  }
  persistentContexts.clear();
}

/**
 * Updates the CAPTCHA pool's `captchaPending` flag through the session
 * manager, when one is available. Fire-and-forget; failures are ignored.
 *
 * - `true`:  WebSearch ran into a CAPTCHA.
 * - `false`: a search succeeded (or an admin cleared the flag).
 */
function markPoolCaptchaPending(pending: boolean): void {
  // Imported lazily: a top-level import could create a core / browser-session cycle.
  void import('./browser.js')
    .then((mod) => {
      const sm = mod.getSessionManager?.();
      sm?.markCaptchaPending?.(pending);
    })
    .catch(() => {
      /* ignore */
    });
}

/**
 * Opens a search engine in the browser and extracts the results.
 *
 * @param engine          Engine definition.
 * @param query           Sanitised query.
 * @param limit           Maximum number of results.
 * @param pageTimeout     Navigation / default page timeout in ms.
 * @param captchaSolve    `'novnc'` keeps CAPTCHA pages and the context open for manual solving.
 * @param maxCaptchaPages Maximum number of CAPTCHA pages kept open.
 * @returns Results plus a flag telling whether a CAPTCHA was hit.
 */
async function searchViaBrowser(
  engine: SearchEngineConfig,
  query: string,
  limit: number,
  pageTimeout: number,
  captchaSolve: 'skip' | 'novnc' = 'skip',
  maxCaptchaPages: number = 5,
): Promise<{ results: SearchResult[]; captcha: boolean }> {
  const url = engine.buildUrl(query, limit);
  logger.debug(`[WebSearch] ${engine.name} browser search: url=${url}`);

  const useNovnc = captchaSolve === 'novnc';
  // WebSearch shares the CAPTCHA-pool browser: once an admin solves a CAPTCHA
  // the cookie stays in persistentContexts and other tasks' searches continue
  // with it. Unlike BrowseWeb, a search is the same no matter who runs it, so
  // sharing is acceptable.
  const { getCaptchaPoolBrowser } = await import('./browser.js');
  const browser = await getCaptchaPoolBrowser();

  let context: import('playwright').BrowserContext;
  if (useNovnc) {
    context = await getOrCreateContext(engine.name, browser);
  } else {
    context = await browser.newContext({
      userAgent: DEFAULT_USER_AGENT,
      locale: 'ja-JP',
    });
  }

  let page: import('playwright').Page | null = null;
  try {
    page = await context.newPage();
    page.setDefaultTimeout(pageTimeout);
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: pageTimeout });
    // Wait up to 3s for a result container (some layouts insert h3 / #search /
    // #rso via JS); carry on even if none appears.
    await page.waitForSelector('h3, #search, #rso', { timeout: 3000 }).catch(() => undefined);

    // CAPTCHA detection from the page title.
    const pageTitle = await page.evaluate(`document.title || ''`) as string;
    if (CAPTCHA_PATTERN.test(pageTitle)) {
      if (useNovnc) {
        trackCaptchaPage(engine.name, page, maxCaptchaPages);
        page = null; // keep the page open so it stays visible in noVNC
        markPoolCaptchaPending(true);
      }
      return { results: [], captcha: true };
    }

    // Stage 1: parse through DOM selectors.
    const domResults = await page.evaluate(engine.domScript(limit)) as SearchResult[];
    if (domResults.length > 0) {
      return { results: domResults, captcha: false };
    }

    // Stage 2: plain-text fallback.
    const bodyText = await page.evaluate(
      `(document.body && document.body.innerText || '').slice(0, 5000)`
    ) as string;

    if (CAPTCHA_PATTERN.test(bodyText)) {
      if (useNovnc) {
        trackCaptchaPage(engine.name, page, maxCaptchaPages);
        page = null;
        markPoolCaptchaPending(true);
      }
      return { results: [], captcha: true };
    }

    const textResults = parseSearchResultsFromText(bodyText, limit);
    return { results: textResults, captcha: false };
  } finally {
    if (page) {
      await page.close().catch(() => {});
    }
    // Persistent (noVNC) contexts are reused; throw-away ones are closed here.
    if (!useNovnc) {
      await context.close().catch(() => {});
    }
  }
}

/** Recognises a Playwright timeout by constructor name or message. */
function isTimeoutError(e: unknown): boolean {
  return e instanceof Error && (
    e.constructor.name === 'TimeoutError' ||
    e.message.includes('Timeout')
  );
}

/**
 * WebSearch entry point: sanitises the query, tries each browser engine in
 * order, then falls back to SearXNG. Every attempt is written to the search
 * history.
 */
async function executeWebSearch(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const rawQuery = input['query'];
  if (typeof rawQuery !== 'string' || rawQuery.trim() === '') {
    return { output: 'WebSearch error: "query" must be a non-empty string', isError: true };
  }
  // NaN falls back to the default; fractions are floored after clamping to 1..20.
  const limitInput = input['limit'];
  const requestedLimit = typeof limitInput === 'number' && !Number.isNaN(limitInput) ? limitInput : 5;
  const limit = Math.floor(Math.min(Math.max(1, requestedLimit), 20));

  // Strip sensitive information from the query.
  const filterConfig = ctx.searchFilter ?? ctx.toolsConfig?.searchFilter ?? {};
  const sanitized = sanitizeQuery(rawQuery, filterConfig);
  if (sanitized === null) {
    return { output: '検索クエリが機密情報のみで構成されています。クエリを見直してください。', isError: true };
  }
  const query = sanitized;
  if (sanitized !== rawQuery) {
    logger.info(`[WebSearch] クエリに機密情報が含まれていたためフィルタを適用しました`);
  }

  const pageTimeout = ctx.toolsConfig?.browserPageTimeout ?? 60000;
  const { loadConfig } = await import('../../config.js');
  const appConfig = loadConfig();
  const captchaSolve = appConfig.browser?.captchaSolve ?? 'skip';
  const maxCaptchaPages = appConfig.browser?.maxCaptchaPages ?? 5;

  const logAttempt = (
    method: string,
    fallback: boolean,
    resultCount: number,
    outcome: WebSearchHistoryRecord['outcome'],
    error?: string,
  ): void => {
    appendWebSearchHistory(ctx, {
      timestamp: new Date().toISOString(),
      query,
      limit,
      method,
      resultCount,
      outcome,
      fallback,
      ...(error !== undefined ? { error } : {}),
    });
  };

  // --- Browser search: engines are tried in SEARCH_ENGINES order ---
  let lastBrowserError: string | null = null;

  for (let i = 0; i < SEARCH_ENGINES.length; i++) {
    const engine = SEARCH_ENGINES[i]!;
    const isFallback = i > 0;
    const methodName = engine.name.toLowerCase();

    try {
      const { results, captcha } = await searchViaBrowser(engine, query, limit, pageTimeout, captchaSolve, maxCaptchaPages);

      if (captcha) {
        lastBrowserError = `${engine.name} が CAPTCHA を要求しました`;
        logger.warn(`[WebSearch] ${lastBrowserError}, trying next engine`);
        logAttempt(methodName, isFallback, 0, 'captcha');
        continue;
      }

      if (results.length > 0) {
        const formatted = formatResults(results);
        logAttempt(methodName, isFallback, results.length, 'success');
        // A successful search means the pool is past the CAPTCHA; lower the flag.
        if (captchaSolve === 'novnc') markPoolCaptchaPending(false);
        return { output: formatted, isError: false };
      }

      // Zero results — move on to the next engine.
      logAttempt(methodName, isFallback, 0, 'success');
      lastBrowserError = `${engine.name}: 結果なし`;
      logger.info(`[WebSearch] ${engine.name} returned 0 results, trying next engine`);
    } catch (e) {
      const errMsg = errorMessage(e);
      const outcome = isTimeoutError(e) ? 'timeout' as const : 'error' as const;
      lastBrowserError = `${engine.name}: ${errMsg}`;
      logger.warn(`[WebSearch] ${engine.name} failed: ${lastBrowserError}, trying next engine`);
      logAttempt(methodName, isFallback, 0, outcome, errMsg);

      // A browser launch failure (e.g. Playwright not installed) would repeat
      // for every remaining engine, so skip straight to SearXNG.
      if (
        errMsg.includes("Executable doesn't exist") ||
        errMsg.includes('browserType.launch') ||
        errMsg.includes('Playwright browser unavailable')
      ) {
        logger.info(`[WebSearch] browser unavailable, skipping remaining engines and falling back to SearXNG`);
        break;
      }
    }
  }

  // --- Final fallback: SearXNG ---
  try {
    const results = await searchViaSearxng(query, limit, ctx);

    if (results.length === 0) {
      logAttempt('searxng', true, 0, 'success');
      return { output: '(検索結果なし)', isError: false };
    }

    const formatted = formatResults(results);
    logAttempt('searxng', true, results.length, 'success');
    return { output: formatted, isError: false };
  } catch (e) {
    const searxngError = errorMessage(e);
    logger.warn(`[WebSearch] SearXNG fallback also failed: ${searxngError}`);
    logAttempt('searxng', true, 0, 'error', searxngError);
    return {
      output: `WebSearch error: 全検索エンジンが失敗しました。最後のエラー: ${lastBrowserError}`,
      isError: true,
    };
  }
}

/** Renders results as a numbered list: title, URL and snippet per entry. */
function formatResults(results: SearchResult[]): string {
  return results
    .map((r, i) => {
      const title = r.title || '(タイトルなし)';
      return `[${i + 1}] ${title}\n ${r.url}\n ${r.snippet}`;
    })
    .join('\n\n');
}

// --- WebFetch implementation ---

export type SniffResult =
  | { binary: true; reason: string }
  | { binary: false; text: string; truncated: boolean };

/**
 * Stream a fetch Response body, sniff the first SNIFF_HEAD_BYTES for binary
 * content, and either block (binary) or strict-decode the (capped) text body.
 * Never uses response.text() — that silently produces U+FFFD from binary.
 *
 * The body is capped at MAX_WEBFETCH_BODY_BYTES. Because the cap can fall in
 * the middle of a multi-byte character, a failed strict decode of a
 * *truncated* body is retried with up to 3 trailing bytes dropped before the
 * content is declared undecodable.
 */
export async function sniffAndDecodeBody(
  response: { body: ReadableStream<Uint8Array> | null },
): Promise<SniffResult> {
  const reader = response.body?.getReader();
  if (!reader) return { binary: false, text: '', truncated: false };

  const chunks: Buffer[] = [];
  let total = 0;
  let verdict: BinaryVerdict | null = null;
  let truncated = false;

  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      if (!value || value.byteLength === 0) continue;
      chunks.push(Buffer.from(value));
      total += value.byteLength;

      // Sniff once, as soon as enough head bytes have arrived.
      if (!verdict && total >= SNIFF_HEAD_BYTES) {
        const head = Buffer.concat(chunks).subarray(0, SNIFF_HEAD_BYTES);
        verdict = looksLikeBinaryBytes(head);
        if (verdict.binary) {
          await reader.cancel();
          return { binary: true, reason: verdict.reason };
        }
      }

      if (total >= MAX_WEBFETCH_BODY_BYTES) {
        truncated = true;
        await reader.cancel();
        break;
      }
    }
  } finally {
    try {
      reader.releaseLock();
    } catch {
      /* cancel() may or may not have released the lock depending on runtime */
    }
  }

  let full = Buffer.concat(chunks);
  if (truncated) full = full.subarray(0, MAX_WEBFETCH_BODY_BYTES);

  // Short bodies never reached the sniff threshold inside the loop.
  if (!verdict) {
    verdict = looksLikeBinaryBytes(full.subarray(0, SNIFF_HEAD_BYTES));
    if (verdict.binary) return { binary: true, reason: verdict.reason };
  }

  let text = decodeText(full, verdict.encoding);
  if (text === null && truncated) {
    // The cut may have split a character; drop the dangling bytes and retry.
    for (let drop = 1; drop <= 3 && drop < full.length && text === null; drop++) {
      text = decodeText(full.subarray(0, full.length - drop), verdict.encoding);
    }
  }
  if (text === null) return { binary: true, reason: 'utf8-decode-fail' };
  return { binary: false, text, truncated };
}

/**
 * WebFetch entry point: validates the URL, applies the SSRF check, fetches
 * the page, refuses PDF/binary content and returns the page as plain text
 * (optionally with a preview screenshot). Every exit is written to the
 * WebFetch history.
 */
async function executeWebFetch(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const rawUrl = typeof input['url'] === 'string' ? input['url'] : '';
  // `selector` is accepted but ignored: CSS selection is not possible without
  // an HTML parser such as cheerio. A debug log records that it was dropped.
  const selector = typeof input['selector'] === 'string' ? input['selector'] : undefined;
  const timeoutSec = ctx.toolsConfig?.webfetchTimeout ?? 30;
  const allowedHosts = ctx.toolsConfig?.webfetchAllowedHosts ?? [];

  const logHistory = (
    outcome: WebFetchHistoryRecord['outcome'],
    status: number | null,
    contentType: string | null,
    extra: Pick<WebFetchHistoryRecord, 'error' | 'screenshotPath'> = {},
  ): void => {
    appendWebFetchHistory(ctx, {
      timestamp: new Date().toISOString(),
      url: rawUrl,
      selector,
      status,
      contentType,
      outcome,
      ...extra,
    });
  };

  // Parse the URL.
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(rawUrl);
  } catch {
    logHistory('invalid_url', null, null, { error: `invalid URL "${rawUrl}"` });
    return { output: `WebFetch error: invalid URL "${rawUrl}"`, isError: true };
  }

  // Only web URLs are fetched; other schemes (file:, data:, ...) have no host
  // for the SSRF check and must not reach fetch or the screenshot browser.
  if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
    const error = `unsupported URL scheme "${parsedUrl.protocol}" (only http/https are supported)`;
    logHistory('invalid_url', null, null, { error });
    return { output: `WebFetch error: ${error}`, isError: true };
  }

  const { hostname } = parsedUrl;

  // SSRF check.
  try {
    await checkSSRF(hostname, allowedHosts);
  } catch (e) {
    const error = errorMessage(e);
    logHistory('ssrf_blocked', null, null, { error });
    return { output: `WebFetch error: ${error}`, isError: true };
  }

  if (selector) {
    logger.debug(`[WebFetch] selector "${selector}" is specified but CSS selection is not supported without cheerio — fetching full page`);
  }

  logger.debug(`[WebFetch] fetching ${rawUrl}`);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutSec * 1000);

  try {
    // ssrfSafeFetch re-validates each redirect hop so a public URL cannot
    // 30x-bounce into a private/metadata address.
    const response = await ssrfSafeFetch(rawUrl, allowedHosts, { signal: controller.signal });

    if (!response.ok) {
      logHistory('http_error', response.status, response.headers.get('content-type'), {
        error: `${response.status} ${response.statusText}`,
      });
      await discardBody(response);
      return {
        output: `WebFetch error: HTTP ${response.status} ${response.statusText}. 403/429/5xx の場合は動的レンダリングや bot 検出の可能性があるので BrowseWeb で再試行してください。`,
        isError: true,
      };
    }

    const contentType = response.headers.get('content-type') ?? '';
    if (isPdfResponse(rawUrl, contentType)) {
      logHistory('pdf_blocked', response.status, contentType, { error: 'PDF content blocked' });
      await discardBody(response);
      return {
        output: `WebFetch cannot read PDF content from "${rawUrl}". Use ReadPdf for local PDF files instead of fetching the binary document as text.`,
        isError: true,
      };
    }
    if (isLikelyBinaryResponse(contentType)) {
      logHistory('binary_blocked', response.status, contentType, {
        error: `binary content-type "${contentType || 'unknown'}"`,
      });
      await discardBody(response);
      return {
        output: `WebFetch cannot read binary content-type "${contentType || 'unknown'}" from "${rawUrl}". Fetch a text/HTML endpoint instead.`,
        isError: true,
      };
    }

    // The abort timer is still armed here, so a stalled body read times out too.
    const sniffed = await sniffAndDecodeBody(response);
    // Body fully consumed; the screenshot below has its own timeout.
    clearTimeout(timer);

    if (sniffed.binary) {
      logHistory('binary_blocked', response.status, contentType, {
        error: `binary content detected (${sniffed.reason})`,
      });
      return {
        output: `WebFetch blocked binary content from "${rawUrl}" (detected: ${sniffed.reason}). コンテキストに展開していません。DownloadFile で input/ に保存し、ReadExcel/ReadPdf 等で処理してください。`,
        isError: true,
      };
    }
    const text = htmlToText(sniffed.text) + (sniffed.truncated ? '\n\n[truncated: body exceeded 5MB]' : '');

    // Attach a first-view screenshot when available. A screenshot failure only
    // logs a warning (inside the helper); the fetch still counts as a success.
    const screenshot = await captureWebFetchScreenshot(ctx, rawUrl);

    logHistory(
      'success',
      response.status,
      contentType,
      screenshot ? { screenshotPath: path.relative(ctx.workspacePath, screenshot.savedPath) } : {},
    );

    const images = screenshot
      ? [{ dataUrl: screenshot.dataUrl, label: `WebFetch preview: ${rawUrl}` }]
      : undefined;

    return { output: text, isError: false, ...(images ? { images } : {}) };
  } catch (e) {
    const msg = errorMessage(e);
    logger.warn(`[WebFetch] error: ${msg}`);
    logHistory('error', null, null, { error: msg });
    return {
      output: `WebFetch error: ${msg}. 接続拒否・タイムアウト・動的ページの可能性があれば BrowseWeb で再試行してください。`,
      isError: true,
    };
  } finally {
    clearTimeout(timer);
  }
}

// --- DownloadFile implementation ---

/**
 * DownloadFile entry point: validates the URL, applies the SSRF check,
 * downloads the body and stores it under `input/` or `output/` inside the
 * workspace. Every exit is written to the download history.
 */
async function executeDownloadFile(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const rawUrl = typeof input['url'] === 'string' ? input['url'] : '';
  const rawSection = typeof input['section'] === 'string' ? input['section'] : 'input';
  // Anything other than "output" is stored under input/.
  const section = rawSection === 'output' ? 'output' : 'input';
  const timeoutSec = ctx.toolsConfig?.webfetchTimeout ?? 60;
  const allowedHosts = ctx.toolsConfig?.webfetchAllowedHosts ?? [];

  const logHistory = (
    outcome: DownloadHistoryRecord['outcome'],
    fields: Partial<Pick<DownloadHistoryRecord, 'savedPath' | 'status' | 'contentType' | 'bytes' | 'error'>> = {},
  ): void => {
    appendDownloadHistory(ctx, {
      timestamp: new Date().toISOString(),
      url: rawUrl,
      savedPath: fields.savedPath ?? null,
      status: fields.status ?? null,
      contentType: fields.contentType ?? null,
      bytes: fields.bytes ?? null,
      outcome,
      ...(fields.error !== undefined ? { error: fields.error } : {}),
    });
  };

  // Parse the URL.
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(rawUrl);
  } catch {
    logHistory('invalid_url', { error: `invalid URL "${rawUrl}"` });
    return { output: `DownloadFile error: invalid URL "${rawUrl}"`, isError: true };
  }

  // Only web URLs are downloaded (see executeWebFetch for the rationale).
  if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
    const error = `unsupported URL scheme "${parsedUrl.protocol}" (only http/https are supported)`;
    logHistory('invalid_url', { error });
    return { output: `DownloadFile error: ${error}`, isError: true };
  }

  const { hostname } = parsedUrl;

  // SSRF check.
  try {
    await checkSSRF(hostname, allowedHosts);
  } catch (e) {
    const error = errorMessage(e);
    logHistory('ssrf_blocked', { error });
    return { output: `DownloadFile error: ${error}`, isError: true };
  }

  // Resolve the file name; path separators and reserved characters become "_".
  let filename: string;
  const requestedName = input['filename'];
  if (typeof requestedName === 'string' && requestedName.trim()) {
    filename = requestedName.trim().replace(/[\\/:*?"<>|]/g, '_');
  } else {
    const basename = path.basename(parsedUrl.pathname) || 'download';
    filename = basename.replace(/[\\/:*?"<>|]/g, '_') || 'download';
  }

  // Destination path, restricted to the chosen section directory.
  let resolvedSavePath: string;
  try {
    resolvedSavePath = resolveOutputPathWithin(ctx.workspacePath, path.join(section, filename), [section]);
  } catch (e) {
    const error = errorMessage(e);
    // Recorded like every other failure so the history stays complete.
    logHistory('error', { error });
    return { output: error, isError: true };
  }

  logger.debug(`[DownloadFile] downloading ${rawUrl} -> ${section}/${filename}`);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutSec * 1000);

  try {
    const response = await ssrfSafeFetch(rawUrl, allowedHosts, { signal: controller.signal });

    if (!response.ok) {
      logHistory('http_error', {
        status: response.status,
        contentType: response.headers.get('content-type'),
        error: `${response.status} ${response.statusText}`,
      });
      await discardBody(response);
      return { output: `DownloadFile error: HTTP ${response.status} ${response.statusText}`, isError: true };
    }

    const contentType = response.headers.get('content-type') ?? '';
    // The abort timer stays armed while the body downloads.
    const buf = Buffer.from(await response.arrayBuffer());

    fs.mkdirSync(path.dirname(resolvedSavePath), { recursive: true });
    fs.writeFileSync(resolvedSavePath, buf);

    logHistory('success', {
      savedPath: `${section}/${filename}`,
      status: response.status,
      contentType,
      bytes: buf.length,
    });

    return {
      output: `Downloaded ${buf.length} bytes from ${rawUrl} -> ${section}/${filename} (content-type: ${contentType || 'unknown'})`,
      isError: false,
    };
  } catch (e) {
    const msg = errorMessage(e);
    logger.warn(`[DownloadFile] error: ${msg}`);
    logHistory('error', { error: msg });
    return { output: `DownloadFile error: ${msg}`, isError: true };
  } finally {
    clearTimeout(timer);
  }
}

// --- Exports ---

export const TOOL_DEFS: Record<string, ToolDef> = {
  WebSearch: WEBSEARCH_DEF,
  WebFetch: WEBFETCH_DEF,
  DownloadFile: DOWNLOADFILE_DEF,
};

/**
 * Dispatches a tool call by name.
 *
 * @returns The tool result, or `null` when `name` is not one of this module's tools.
 */
export async function executeTool(
  name: string,
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult | null> {
  switch (name) {
    case 'WebSearch':
      return executeWebSearch(input, ctx);
    case 'WebFetch':
      return executeWebFetch(input, ctx);
    case 'DownloadFile':
      return executeDownloadFile(input, ctx);
    default:
      return null;
  }
}