2026-06-04 00:34:55 +00:00

1106 lines
40 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { ToolDef } from '../../llm/openai-compat.js';
import type { ToolContext, ToolResult } from './core.js';
import { resolveOutputPathWithin } from './core.js';
import { logger } from '../../logger.js';
import type { SearchFilterConfig } from '../../config.js';
import { checkSSRF, ssrfSafeFetch } from './shared/ssrf.js';
import { htmlToText } from './shared/html.js';
import * as fs from 'fs';
import * as path from 'path';
import * as crypto from 'crypto';
import {
looksLikeBinaryBytes,
decodeText,
SNIFF_HEAD_BYTES,
type BinaryVerdict,
} from './binary-detect.js';
// Content-Type prefixes that are treated as binary payloads.
// Entries are compared with startsWith() against the lower-cased header value,
// so 'image/' covers every image subtype and 'application/vnd.' every vendor type.
const BINARY_CONTENT_TYPE_PREFIXES = [
'application/pdf',
'application/octet-stream',
'application/zip',
'application/x-zip-compressed',
'application/vnd.',
'image/',
'audio/',
'video/',
];
// WebFetch text bodies are cut off at 5 MB (prevents huge HTML from bloating the context).
const MAX_WEBFETCH_BODY_BYTES = 5 * 1024 * 1024;
// --- Tool definitions ---
// Function-calling schema for the WebSearch tool: a required `query` string and
// an optional numeric `limit`.
// NOTE(review): the description advertises Google search, while the executor in
// this file walks a chain of engines and finally SearXNG — confirm the wording
// is intentionally simplified.
const WEBSEARCH_DEF: ToolDef = {
type: 'function',
function: {
name: 'WebSearch',
description: 'Google 検索でインターネットを検索する。検索クエリにプライベートIP、内部ドメイン、メールアドレス、電話番号等の機密情報を含めないでください。詳細は ReadToolDoc({ name: "WebSearch" })。',
parameters: {
type: 'object',
properties: {
query: { type: 'string', description: '検索クエリ' },
limit: { type: 'number', description: '結果件数 (デフォルト: 5, 最大: 20)' },
},
required: ['query'],
},
},
};
// Function-calling schema for the WebFetch tool: a required `url` and an
// optional CSS `selector`.
// NOTE(review): the executor in this file only logs the selector and always
// returns the full page — the parameter description may over-promise.
const WEBFETCH_DEF: ToolDef = {
type: 'function',
function: {
name: 'WebFetch',
description: 'URL からページのテキスト内容を取得する(静的ページ向け、軽量・高速)。動的レンダリングや操作が必要なら BrowseWeb。詳細は ReadToolDoc({ name: "WebFetch" })。',
parameters: {
type: 'object',
properties: {
url: { type: 'string', description: '取得する URL' },
selector: { type: 'string', description: 'CSS セレクタで特定要素だけ取得 (省略時はページ全体)' },
},
required: ['url'],
},
},
};
// Function-calling schema for the DownloadFile tool: a required `url`, an
// optional `filename` (derived from the URL when omitted) and an optional
// `section` that selects the workspace sub-directory ("input" by default).
const DOWNLOADFILE_DEF: ToolDef = {
type: 'function',
function: {
name: 'DownloadFile',
description: 'URL からファイルPDF・画像・バイナリ等をダウンロードし、workspace の input/ または output/ に保存する。WebFetch では取得できない PDF や画像ファイルに使う。詳細・命名規約は ReadToolDoc({ name: "DownloadFile" })。',
parameters: {
type: 'object',
properties: {
url: { type: 'string', description: 'ダウンロードする URL' },
filename: { type: 'string', description: '保存するファイル名(省略時は URL から推定)' },
section: { type: 'string', description: '保存先: "input" または "output" (デフォルト: "input")' },
},
required: ['url'],
},
},
};
// --- Search query sanitization ---
// Private IPv4 addresses (10.*, 172.16-31.*, 192.168.*, 127.*).
const PRIVATE_IP_PATTERN = /\b(?:10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(?:1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}|192\.168\.\d{1,3}\.\d{1,3}|127\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/g;
// E-mail addresses.
const EMAIL_PATTERN = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
// Japanese phone numbers (090-1234-5678, 03-1234-5678, 0120-123-456, ...).
// The separator class deliberately accepts ASCII hyphen, U+2010 and the katakana long-vowel mark.
const PHONE_PATTERN = /\b0\d{1,4}[-‐ー]\d{1,4}[-‐ー]\d{1,4}\b/g;
// Internal domains (.local, .internal, .lan, .intranet, .corp, .home).
// Case-insensitive: host names are case-insensitive, so "NAS.LOCAL" must be caught too.
const INTERNAL_DOMAIN_PATTERN = /\b[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?\.(?:local|internal|lan|intranet|corp|home)\b/gi;

/**
 * Strip sensitive fragments from a search query before it leaves the machine.
 *
 * Built-in filters (private IPv4, e-mail, phone number, internal domain) are
 * each enabled unless the matching `config.autoBlock` flag is explicitly
 * `false`. Every entry of `config.blockedPatterns` is then removed as a
 * literal substring, and whitespace is collapsed.
 *
 * @param query  Raw query text.
 * @param config Filter configuration; missing fields fall back to "enabled".
 * @returns The sanitized query, or `null` when nothing is left after filtering.
 */
export function sanitizeQuery(query: string, config: SearchFilterConfig): string | null {
  const flags = config.autoBlock;
  // Order matters: e-mail addresses are removed before internal domains so an
  // address is dropped as a whole instead of leaving a dangling "user@".
  const builtinFilters: Array<[boolean, RegExp]> = [
    [flags?.privateIp !== false, PRIVATE_IP_PATTERN],
    [flags?.email !== false, EMAIL_PATTERN],
    [flags?.phone !== false, PHONE_PATTERN],
    [flags?.internalDomain !== false, INTERNAL_DOMAIN_PATTERN],
  ];

  let result = query;
  for (const [enabled, pattern] of builtinFilters) {
    if (enabled) {
      result = result.replace(pattern, '');
    }
  }

  // Custom block list: literal removal of every occurrence.
  if (config.blockedPatterns) {
    for (const pattern of config.blockedPatterns) {
      result = result.split(pattern).join('');
    }
  }

  // Normalize the whitespace left behind by the removals.
  result = result.replace(/\s+/g, ' ').trim();
  return result.length > 0 ? result : null;
}
/**
 * Extract URL + title + snippet triples from the innerText of a search
 * results page.
 *
 * A line that consists solely of an http(s) URL is treated as a result; the
 * line before it becomes the title and the line after it the snippet.
 * Google-internal links (`google.com/search`, `google.com/imgres`) are skipped.
 *
 * @param text  Plain text of the results page.
 * @param limit Maximum number of results; a non-positive or NaN limit yields `[]`.
 * @returns Up to `limit` results in page order.
 */
export function parseSearchResultsFromText(
  text: string,
  limit: number,
): Array<{ title: string; url: string; snippet: string }> {
  const results: Array<{ title: string; url: string; snippet: string }> = [];
  // Guard first: the cap below is only checked after a push, so without this
  // a limit of 0 would still return one result and NaN would disable the cap.
  if (!(limit > 0)) return results;

  const lines = text.split('\n');
  const urlPattern = /^(https?:\/\/[^\s]+)$/;
  for (let i = 0; i < lines.length; i++) {
    const match = urlPattern.exec((lines[i] ?? '').trim());
    const url = match?.[1];
    if (url === undefined) continue;
    // Skip Google-internal links.
    if (url.includes('google.com/search') || url.includes('google.com/imgres')) continue;
    const title = (lines[i - 1] ?? '').trim() || '(タイトルなし)';
    const snippet = (lines[i + 1] ?? '').trim();
    results.push({ title, url, snippet });
    if (results.length >= limit) break;
  }
  return results;
}
/**
 * Decide from the Content-Type header alone whether a response is binary.
 *
 * An empty header, any `text/*` type, and any type mentioning json / xml /
 * javascript / svg count as text. Otherwise the value is binary when it
 * starts with one of BINARY_CONTENT_TYPE_PREFIXES.
 */
function isLikelyBinaryResponse(contentType: string): boolean {
  const lowered = contentType.toLowerCase();
  if (lowered === '' || lowered.startsWith('text/')) return false;

  // Structured-text formats that often travel under application/* or image/*.
  const textualMarkers = ['json', 'xml', 'javascript', 'svg'];
  if (textualMarkers.some((marker) => lowered.includes(marker))) return false;

  for (const prefix of BINARY_CONTENT_TYPE_PREFIXES) {
    if (lowered.startsWith(prefix)) return true;
  }
  return false;
}
/**
 * Report whether a response should be treated as a PDF: either the
 * Content-Type starts with `application/pdf`, or the URL path ends in `.pdf`
 * (both compared case-insensitively). An unparsable URL counts as "not a PDF".
 */
function isPdfResponse(rawUrl: string, contentType: string): boolean {
  if (contentType.toLowerCase().startsWith('application/pdf')) {
    return true;
  }
  let pathname: string;
  try {
    pathname = new URL(rawUrl).pathname;
  } catch {
    return false;
  }
  return pathname.toLowerCase().endsWith('.pdf');
}
// One JSONL line written per WebFetch call.
// `status` / `contentType` are null when no HTTP response was obtained;
// `screenshotPath` is only set when a preview screenshot was saved.
type WebFetchHistoryRecord = {
timestamp: string;
url: string;
selector?: string;
status: number | null;
contentType: string | null;
outcome: 'success' | 'invalid_url' | 'ssrf_blocked' | 'http_error' | 'pdf_blocked' | 'binary_blocked' | 'error';
error?: string;
screenshotPath?: string;
};
// One JSONL line written per DownloadFile call.
// `savedPath` and `bytes` are null unless the download succeeded.
type DownloadHistoryRecord = {
timestamp: string;
url: string;
savedPath: string | null;
status: number | null;
contentType: string | null;
bytes: number | null;
outcome: 'success' | 'invalid_url' | 'ssrf_blocked' | 'http_error' | 'error';
error?: string;
};
// One JSONL line written per search attempt (one per engine tried).
// `method` is the engine identifier; `fallback` marks attempts after the first engine.
type WebSearchHistoryRecord = {
timestamp: string;
query: string;
limit: number;
method: string;
resultCount: number;
outcome: 'success' | 'captcha' | 'timeout' | 'error';
fallback: boolean;
error?: string;
};
/**
 * Append one search-attempt record to `<workspace>/logs/websearch-history.jsonl`.
 * Best effort: any filesystem failure is logged as a warning and swallowed so
 * history logging can never break a search.
 */
export function appendWebSearchHistory(ctx: ToolContext, record: WebSearchHistoryRecord): void {
  try {
    const dir = path.join(ctx.workspacePath, 'logs');
    fs.mkdirSync(dir, { recursive: true });
    const line = `${JSON.stringify(record)}\n`;
    fs.appendFileSync(path.join(dir, 'websearch-history.jsonl'), line, 'utf-8');
  } catch (e) {
    logger.warn(`[WebSearch] failed to write history: ${(e as Error).message}`);
  }
}
// Desktop Chrome user-agent string used for the browser contexts that run searches.
const DEFAULT_USER_AGENT =
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
/** A single normalized search hit, independent of the engine that produced it. */
export interface SearchResult {
// Result heading as shown by the engine.
title: string;
// Target URL of the hit.
url: string;
// Short excerpt shown under the heading; may be empty.
snippet: string;
}
/**
 * Search through the SearXNG JSON API (used as the final fallback).
 *
 * @param query Sanitized query text.
 * @param limit Maximum number of results to return.
 * @param ctx   Tool context; `toolsConfig.searxngUrl` (default
 *              `http://searxng:8080`) and `toolsConfig.websearchTimeout`
 *              (seconds, default 15) are honoured.
 * @returns Up to `limit` normalized results.
 * @throws On a non-2xx response, on network failure, or when the timeout
 *         aborts the request (including while the body is still being read).
 */
export async function searchViaSearxng(
  query: string,
  limit: number,
  ctx: ToolContext,
): Promise<SearchResult[]> {
  // Drop trailing slashes so a configured "http://host/" does not yield "//search".
  const searxngUrl = (ctx.toolsConfig?.searxngUrl ?? 'http://searxng:8080').replace(/\/+$/, '');
  const timeoutSec = ctx.toolsConfig?.websearchTimeout ?? 15;
  const url = `${searxngUrl}/search?q=${encodeURIComponent(query)}&format=json&language=ja`;
  logger.debug(`[WebSearch] SearXNG fallback: url=${url}`);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutSec * 1000);
  try {
    const response = await fetch(url, { signal: controller.signal });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} ${response.statusText}`);
    }
    // The timer stays armed while the body is read: headers arriving quickly
    // must not disable the timeout for a stalled JSON payload.
    const json = (await response.json()) as {
      results?: Array<{ title?: string; url?: string; content?: string }>;
    } | null;
    const rows = Array.isArray(json?.results) ? json.results : [];
    return rows.slice(0, limit).map((r) => ({
      title: r.title ?? '(タイトルなし)',
      url: r.url ?? '',
      snippet: r.content ?? '',
    }));
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Append one download record to `<workspace>/logs/downloads.jsonl`.
 * Best effort: failures are logged as warnings and never propagate.
 */
function appendDownloadHistory(ctx: ToolContext, record: DownloadHistoryRecord): void {
  try {
    const dir = path.join(ctx.workspacePath, 'logs');
    fs.mkdirSync(dir, { recursive: true });
    const line = `${JSON.stringify(record)}\n`;
    fs.appendFileSync(path.join(dir, 'downloads.jsonl'), line, 'utf-8');
  } catch (e) {
    logger.warn(`[DownloadFile] failed to write history: ${(e as Error).message}`);
  }
}
/**
 * Append one fetch record to `<workspace>/logs/webfetch-history.jsonl`.
 * Best effort: failures are logged as warnings and never propagate.
 */
function appendWebFetchHistory(ctx: ToolContext, record: WebFetchHistoryRecord): void {
  try {
    const dir = path.join(ctx.workspacePath, 'logs');
    fs.mkdirSync(dir, { recursive: true });
    const line = `${JSON.stringify(record)}\n`;
    fs.appendFileSync(path.join(dir, 'webfetch-history.jsonl'), line, 'utf-8');
  } catch (e) {
    logger.warn(`[WebFetch] failed to write history: ${(e as Error).message}`);
  }
}
/**
 * After a successful WebFetch, take an above-the-fold screenshot with
 * Playwright, store it under `logs/webfetch-screenshots/` and return it as a
 * data URL.
 *
 * - Returns `null` immediately when `ctx.vlmEnabled` is falsy or
 *   `toolsConfig.webfetchScreenshot` is explicitly `false`.
 * - Any failure (Playwright unavailable, navigation error, ...) is logged as a
 *   warning and reported as `null`; the caller still treats WebFetch as
 *   successful.
 *
 * NOTE(review): the page is loaded by the browser itself, so redirects and
 * sub-resources are not re-validated here — confirm that is acceptable given
 * the SSRF checks applied to the plain fetch path.
 */
async function captureWebFetchScreenshot(
  ctx: ToolContext,
  url: string,
): Promise<{ dataUrl: string; savedPath: string } | null> {
  const disabled = !ctx.vlmEnabled || ctx.toolsConfig?.webfetchScreenshot === false;
  if (disabled) return null;

  const navigationTimeout = ctx.toolsConfig?.webfetchScreenshotTimeoutMs ?? 15_000;
  let browserContext: import('playwright').BrowserContext | null = null;
  let tab: import('playwright').Page | null = null;
  try {
    // Preview screenshots may be shared across tasks, so the CAPTCHA-pool
    // browser is borrowed instead of a per-task BrowseWeb session (which
    // stays isolated).
    const { getCaptchaPoolBrowser } = await import('./browser.js');
    const browser = await getCaptchaPoolBrowser();
    browserContext = await browser.newContext({
      viewport: { width: 1280, height: 1600 },
      userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    tab = await browserContext.newPage();
    await tab.goto(url, { waitUntil: 'load', timeout: navigationTimeout });
    // Give the network at most 3 s to settle; carry on regardless.
    await tab.waitForLoadState('networkidle', { timeout: 3_000 }).catch(() => undefined);

    const shotsDir = path.join(ctx.workspacePath, 'logs', 'webfetch-screenshots');
    fs.mkdirSync(shotsDir, { recursive: true });
    // File name: ISO timestamp (with ':' and '.' replaced) + short URL digest.
    const urlDigest = crypto.createHash('sha1').update(url).digest('hex').slice(0, 10);
    const stamp = new Date().toISOString().replace(/[:.]/g, '-');
    const savedPath = path.join(shotsDir, `${stamp}-${urlDigest}.png`);

    const png = await tab.screenshot({ fullPage: false, type: 'png' });
    fs.writeFileSync(savedPath, png);
    const dataUrl = `data:image/png;base64,${png.toString('base64')}`;
    logger.debug(`[WebFetch] screenshot saved: ${path.relative(ctx.workspacePath, savedPath)} (${png.byteLength} bytes)`);
    return { dataUrl, savedPath };
  } catch (e) {
    const msg = (e as Error).message ?? String(e);
    logger.warn(`[WebFetch] screenshot failed for ${url}: ${msg}`);
    return null;
  } finally {
    // Close page first, then its context; close errors are irrelevant here.
    if (tab !== null) await tab.close().catch(() => undefined);
    if (browserContext !== null) await browserContext.close().catch(() => undefined);
  }
}
// --- WebSearch implementation (browser search chain + SearXNG fallback) ---
/** CAPTCHA / bot-detection phrases (Japanese and English), matched case-insensitively. */
const CAPTCHA_PATTERN = /unusual traffic|CAPTCHA|captcha|通常と異なるトラフィック|ロボットではない|not a robot|verify you are human|Drag the slider/i;
/** Description of one browser-driven search engine. */
interface SearchEngineConfig {
// Display name; also used as the key for the persistent context.
name: string;
// Builds the results-page URL for a query.
buildUrl: (query: string, limit: number) => string;
/** Returns the script source (a string template with `limit` embedded) that parses results from the DOM. */
domScript: (limit: number) => string;
/** Patterns marking engine-internal links (skip a result when its URL contains one). */
internalUrlPatterns?: string[];
}
// Engines tried in array order. Each `domScript` returns the source of an
// IIFE that is evaluated inside the results page and yields an array of
// { title, url, snippet } objects, capped at `limit`.
// The script bodies are runtime strings: the comments inside them are sent to
// the page as-is, so they are intentionally left untouched.
const SEARCH_ENGINES: SearchEngineConfig[] = [
// Google: `num` carries the limit; only http(s) links wrapped around an <h3> are
// accepted, Google-internal links are skipped and duplicates are dropped.
{
name: 'Google',
buildUrl: (q, limit) => `https://www.google.com/search?q=${encodeURIComponent(q)}&hl=ja&num=${limit}`,
domScript: (limit) => `
(function(maxResults) {
var items = [];
var seen = {};
// 2025-2026: Google は .g 以外に .MjjYud / .Gx5Zad / [data-sokoban-container] [jscontroller] も使う
var els = document.querySelectorAll('div.MjjYud, div.g, div.Gx5Zad, [data-sokoban-container] [jscontroller]');
for (var i = 0; i < els.length; i++) {
if (items.length >= maxResults) break;
var el = els[i];
var titleEl = el.querySelector('h3');
if (!titleEl) continue;
// h3 を内包する <a> が本当の結果リンク。最初の <a> だと「このページについて」等を拾ってしまう
var linkEl = titleEl.closest('a[href]');
if (!linkEl) continue;
var href = linkEl.getAttribute('href') || '';
if (!/^https?:\\/\\//.test(href)) continue;
if (href.indexOf('google.com/search') !== -1 || href.indexOf('google.com/imgres') !== -1) continue;
if (seen[href]) continue;
seen[href] = true;
var snippetEl = el.querySelector('[data-sncf], .VwiC3b, [style*="-webkit-line-clamp"]');
items.push({
title: (titleEl.textContent || '').trim(),
url: href,
snippet: snippetEl ? (snippetEl.textContent || '').trim() : ''
});
}
return items;
})(${limit})`,
},
// Brave: `limit` is not forwarded in the URL; the cap is applied by the DOM
// script only. Links back into brave.com or to /search are skipped.
{
name: 'Brave',
buildUrl: (q, limit) => `https://search.brave.com/search?q=${encodeURIComponent(q)}&source=web`,
domScript: (limit) => `
(function(maxResults) {
var items = [];
var els = document.querySelectorAll('.snippet');
for (var i = 0; i < els.length; i++) {
if (items.length >= maxResults) break;
var el = els[i];
var linkEl = el.querySelector('.result-header, a.heading[href]');
if (!linkEl) linkEl = el.querySelector('a[href]');
var titleEl = el.querySelector('.snippet-title');
if (!titleEl) titleEl = el.querySelector('h2, h3');
var snippetEl = el.querySelector('.snippet-description');
if (!snippetEl) snippetEl = el.querySelector('.snippet-content p, p');
if (linkEl) {
var href = linkEl.getAttribute('href') || '';
if (href.indexOf('/search') === 0 || href.indexOf('brave.com') !== -1) continue;
var title = titleEl ? (titleEl.textContent || '').trim() : (linkEl.textContent || '').trim();
if (!title) continue;
items.push({
title: title,
url: href,
snippet: snippetEl ? (snippetEl.textContent || '').trim() : ''
});
}
}
return items;
})(${limit})`,
},
// Yahoo! JAPAN: `n` carries the limit. The script falls back to the `.Sr`
// selector when the primary selectors match nothing; hrefs are not filtered.
{
name: 'Yahoo',
buildUrl: (q, limit) => `https://search.yahoo.co.jp/search?p=${encodeURIComponent(q)}&n=${limit}`,
domScript: (limit) => `
(function(maxResults) {
var items = [];
var els = document.querySelectorAll('#web .algo, #WS2m .sw-Card');
if (els.length === 0) els = document.querySelectorAll('.Sr');
for (var i = 0; i < els.length; i++) {
if (items.length >= maxResults) break;
var el = els[i];
var linkEl = el.querySelector('a[href]');
var titleEl = el.querySelector('h3, .sw-Card__title');
var snippetEl = el.querySelector('.sw-Card__description, p');
if (linkEl) {
var href = linkEl.getAttribute('href') || '';
var title = titleEl ? (titleEl.textContent || '').trim() : (linkEl.textContent || '').trim();
if (!title) continue;
items.push({
title: title,
url: href,
snippet: snippetEl ? (snippetEl.textContent || '').trim() : ''
});
}
}
return items;
})(${limit})`,
},
];
// --- Persistent BrowserContext management (for solving CAPTCHAs through noVNC) ---
// One long-lived browser context per engine name, reused across searches.
const persistentContexts = new Map<string, import('playwright').BrowserContext>();
// Pages left open on a CAPTCHA, oldest first, with their creation time (ms since epoch).
const captchaPages: Array<{ engineName: string; page: import('playwright').Page; createdAt: number }> = [];
/**
 * Return the persistent browser context for an engine, creating it on demand.
 *
 * A cached context is reused only while the browser still lists it among its
 * contexts; a stale entry (or a failing liveness check) is dropped and
 * replaced by a fresh context using the default user agent and ja-JP locale.
 */
async function getOrCreateContext(
  engineName: string,
  browser: import('playwright').Browser,
): Promise<import('playwright').BrowserContext> {
  const cached = persistentContexts.get(engineName);
  if (cached) {
    let alive = false;
    try {
      alive = browser.contexts().includes(cached);
    } catch {
      // A throwing liveness check is treated the same as "not alive".
    }
    if (alive) return cached;
    persistentContexts.delete(engineName);
  }

  const created = await browser.newContext({
    userAgent: DEFAULT_USER_AGENT,
    locale: 'ja-JP',
  });
  persistentContexts.set(engineName, created);
  return created;
}
/**
 * Remember a page that is showing a CAPTCHA so it stays open for manual
 * solving, evicting (and closing) the oldest tracked pages beyond `maxPages`.
 *
 * @param engineName Engine the page belongs to.
 * @param page       The page left open on the CAPTCHA.
 * @param maxPages   Maximum number of tracked pages. Negative values are
 *                   treated as 0; NaN disables eviction (as before).
 */
function trackCaptchaPage(
  engineName: string,
  page: import('playwright').Page,
  maxPages: number,
): void {
  captchaPages.push({ engineName, page, createdAt: Date.now() });
  // Clamp at 0: with a negative cap the loop condition would stay true even
  // for an empty list (0 > -1) and spin forever.
  const cap = Math.max(0, maxPages);
  while (captchaPages.length > cap) {
    const oldest = captchaPages.shift();
    // Fire-and-forget: a page that fails to close must not block tracking.
    oldest?.page.close().catch(() => {});
  }
}
/**
 * Close and forget every tracked CAPTCHA page and every persistent context.
 * Close calls are fire-and-forget; their failures are ignored.
 */
export function clearPersistentContexts(): void {
  captchaPages.forEach((tracked) => {
    tracked.page.close().catch(() => {});
  });
  captchaPages.splice(0, captchaPages.length);

  persistentContexts.forEach((context) => {
    context.close().catch(() => {});
  });
  persistentContexts.clear();
}
/**
 * Update the CAPTCHA pool's "captcha pending" flag through the session
 * manager, when one is available.
 *
 * - `true`: a WebSearch ran into a CAPTCHA.
 * - `false`: a search succeeded, or an admin cleared the flag manually.
 *
 * The browser module is imported lazily (a top-level import could create a
 * circular reference between core and the browser session code). The update
 * is fire-and-forget and every failure is ignored.
 */
function markPoolCaptchaPending(pending: boolean): void {
  void (async () => {
    try {
      const mod = await import('./browser.js');
      mod.getSessionManager?.()?.markCaptchaPending?.(pending);
    } catch {
      /* ignore */
    }
  })();
}
/**
 * Run one search against a browser-driven engine and parse the results page.
 *
 * Parsing is attempted in two stages: the engine's DOM script first, then a
 * plain-text fallback over the first 5000 characters of the body. A CAPTCHA
 * is detected from the page title and, failing that, from the body text.
 *
 * In `'novnc'` mode the engine's persistent context is used and a page that
 * hit a CAPTCHA is left open (and tracked) so it can be solved manually; in
 * `'skip'` mode a throw-away context is created and always closed.
 *
 * @returns The parsed results and whether a CAPTCHA was encountered.
 */
async function searchViaBrowser(
  engine: SearchEngineConfig,
  query: string,
  limit: number,
  pageTimeout: number,
  captchaSolve: 'skip' | 'novnc' = 'skip',
  maxCaptchaPages: number = 5,
): Promise<{ results: SearchResult[]; captcha: boolean }> {
  const url = engine.buildUrl(query, limit);
  logger.debug(`[WebSearch] ${engine.name} browser search: url=${url}`);
  const useNovnc = captchaSolve === 'novnc';

  // WebSearch shares the CAPTCHA-pool browser: once an admin has solved a
  // CAPTCHA, the cookies stay in the persistent context and other tasks'
  // searches can continue with them. Unlike BrowseWeb, a search is the same
  // no matter who runs it, so sharing is acceptable.
  const { getCaptchaPoolBrowser } = await import('./browser.js');
  const browser = await getCaptchaPoolBrowser();
  const context: import('playwright').BrowserContext = useNovnc
    ? await getOrCreateContext(engine.name, browser)
    : await browser.newContext({
        userAgent: DEFAULT_USER_AGENT,
        locale: 'ja-JP',
      });

  let page: import('playwright').Page | null = null;

  // Shared CAPTCHA outcome. In noVNC mode the page is handed over to the
  // tracker and detached from `page`, so the `finally` below leaves it open.
  const reportCaptcha = (
    tab: import('playwright').Page,
  ): { results: SearchResult[]; captcha: boolean } => {
    if (useNovnc) {
      trackCaptchaPage(engine.name, tab, maxCaptchaPages);
      page = null;
      markPoolCaptchaPending(true);
    }
    return { results: [], captcha: true };
  };

  try {
    const tab = await context.newPage();
    page = tab;
    tab.setDefaultTimeout(pageTimeout);
    await tab.goto(url, { waitUntil: 'domcontentloaded', timeout: pageTimeout });
    // Wait up to 3 s for a result container to render; continue either way.
    // Some Google layouts insert h3 / #search / #rso via JavaScript.
    await tab.waitForSelector('h3, #search, #rso', { timeout: 3000 }).catch(() => undefined);

    // CAPTCHA detection from the title.
    const pageTitle = await tab.evaluate(`document.title || ''`) as string;
    if (CAPTCHA_PATTERN.test(pageTitle)) {
      return reportCaptcha(tab);
    }

    // Stage 1: parse through DOM selectors.
    const domResults = await tab.evaluate(engine.domScript(limit)) as SearchResult[];
    if (domResults.length > 0) {
      return { results: domResults, captcha: false };
    }

    // Stage 2: plain-text fallback.
    const bodyText = await tab.evaluate(
      `(document.body && document.body.innerText || '').slice(0, 5000)`
    ) as string;
    if (CAPTCHA_PATTERN.test(bodyText)) {
      return reportCaptcha(tab);
    }
    return { results: parseSearchResultsFromText(bodyText, limit), captcha: false };
  } finally {
    const remaining = page as import('playwright').Page | null;
    if (remaining) {
      await remaining.close().catch(() => {});
    }
    if (!useNovnc) {
      await context.close().catch(() => {});
    }
  }
}
/**
 * Tell whether a thrown value represents a timeout.
 *
 * An `Error` qualifies when its `name` or its constructor's name is
 * `TimeoutError`, or when its message contains `Timeout`. Checking `name` as
 * well keeps detection working when the class name is not preserved (e.g.
 * minified builds) or the error is a DOMException-style timeout.
 *
 * @param e Any caught value; non-`Error` values are never timeouts.
 */
function isTimeoutError(e: unknown): boolean {
  if (!(e instanceof Error)) return false;
  return (
    e.name === 'TimeoutError' ||
    e.constructor.name === 'TimeoutError' ||
    e.message.includes('Timeout')
  );
}
/**
 * WebSearch tool entry point.
 *
 * Flow: validate input, sanitize the query, then try each browser engine in
 * SEARCH_ENGINES order; the first engine returning at least one result wins.
 * CAPTCHA, empty results and errors move on to the next engine. A browser
 * launch failure skips the remaining engines. SearXNG is the final fallback.
 * Every attempt is appended to the search history.
 *
 * @param input Tool arguments: `query` (string, required) and `limit`
 *              (number, optional; truncated to an integer and clamped to 1..20,
 *              defaulting to 5 when missing or not a finite number).
 * @param ctx   Tool context.
 * @returns Formatted results, or an error result when every backend failed.
 */
async function executeWebSearch(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const rawQuery = input['query'];
  if (typeof rawQuery !== 'string') {
    // Without this guard a missing query crashes inside sanitizeQuery.
    return { output: 'WebSearch error: "query" must be a string', isError: true };
  }
  // NaN / Infinity fall back to the default; fractions are truncated so the
  // value is safe to embed in URLs and to use as a loop cap.
  const requestedLimit = input['limit'];
  const rawLimit =
    typeof requestedLimit === 'number' && Number.isFinite(requestedLimit)
      ? Math.trunc(requestedLimit)
      : 5;
  const limit = Math.min(Math.max(1, rawLimit), 20);

  // Filter sensitive information out of the query.
  const filterConfig = ctx.searchFilter ?? ctx.toolsConfig?.searchFilter ?? {};
  const sanitized = sanitizeQuery(rawQuery, filterConfig);
  if (sanitized === null) {
    return { output: '検索クエリが機密情報のみで構成されています。クエリを見直してください。', isError: true };
  }
  const query = sanitized;
  if (sanitized !== rawQuery) {
    logger.info(`[WebSearch] クエリに機密情報が含まれていたためフィルタを適用しました`);
  }

  const pageTimeout = ctx.toolsConfig?.browserPageTimeout ?? 60000;
  const { loadConfig } = await import('../../config.js');
  const appConfig = loadConfig();
  const captchaSolve = appConfig.browser?.captchaSolve ?? 'skip';
  const maxCaptchaPages = appConfig.browser?.maxCaptchaPages ?? 5;

  // Single place that shapes a history record for this call.
  const record = (
    method: string,
    resultCount: number,
    outcome: WebSearchHistoryRecord['outcome'],
    fallback: boolean,
    error?: string,
  ): void => {
    appendWebSearchHistory(ctx, {
      timestamp: new Date().toISOString(),
      query, limit, method,
      resultCount, outcome, fallback,
      ...(error !== undefined ? { error } : {}),
    });
  };

  // --- Browser search: engines are tried in SEARCH_ENGINES order ---
  let lastBrowserError: string | null = null;
  for (const [index, engine] of SEARCH_ENGINES.entries()) {
    const isFallback = index > 0;
    const methodName = engine.name.toLowerCase();
    try {
      const { results, captcha } = await searchViaBrowser(engine, query, limit, pageTimeout, captchaSolve, maxCaptchaPages);
      if (captcha) {
        lastBrowserError = `${engine.name} が CAPTCHA を要求しました`;
        logger.warn(`[WebSearch] ${lastBrowserError}, trying next engine`);
        record(methodName, 0, 'captcha', isFallback);
        continue;
      }
      if (results.length > 0) {
        const formatted = formatResults(results);
        record(methodName, results.length, 'success', isFallback);
        // A successful search means the pool is past the CAPTCHA: lower the flag.
        if (captchaSolve === 'novnc') markPoolCaptchaPending(false);
        return { output: formatted, isError: false };
      }
      // Zero results: move on to the next engine.
      record(methodName, 0, 'success', isFallback);
      lastBrowserError = `${engine.name}: 結果なし`;
      logger.info(`[WebSearch] ${engine.name} returned 0 results, trying next engine`);
    } catch (e) {
      const errMsg = (e as Error).message ?? String(e);
      const outcome = isTimeoutError(e) ? 'timeout' as const : 'error' as const;
      lastBrowserError = `${engine.name}: ${errMsg}`;
      logger.warn(`[WebSearch] ${engine.name} failed: ${lastBrowserError}, trying next engine`);
      record(methodName, 0, outcome, isFallback, errMsg);
      // A browser launch failure (e.g. Playwright not installed) would repeat
      // for every remaining engine, so skip straight to SearXNG.
      if (errMsg.includes("Executable doesn't exist") || errMsg.includes('browserType.launch') || errMsg.includes('Playwright browser unavailable')) {
        logger.info(`[WebSearch] browser unavailable, skipping remaining engines and falling back to SearXNG`);
        break;
      }
    }
  }

  // --- Final fallback: SearXNG ---
  try {
    const results = await searchViaSearxng(query, limit, ctx);
    record('searxng', results.length, 'success', true);
    if (results.length === 0) {
      return { output: '(検索結果なし)', isError: false };
    }
    return { output: formatResults(results), isError: false };
  } catch (e) {
    const searxngError = (e as Error).message ?? String(e);
    logger.warn(`[WebSearch] SearXNG fallback also failed: ${searxngError}`);
    record('searxng', 0, 'error', true, searxngError);
    // Report both failures: the SearXNG error is the one that actually ended
    // the chain, and the browser error may be absent.
    const browserPart = lastBrowserError ?? '(browser search not attempted)';
    return {
      output: `WebSearch error: 全検索エンジンが失敗しました。最後のエラー: ${browserPart} / SearXNG: ${searxngError}`,
      isError: true,
    };
  }
}
/**
 * Render search results as numbered text blocks separated by blank lines.
 * Each block is "[n] title", the URL and the snippet on consecutive lines;
 * an empty title is replaced by a placeholder.
 */
function formatResults(results: SearchResult[]): string {
  const blocks: string[] = [];
  results.forEach((entry, index) => {
    const heading = entry.title || '(タイトルなし)';
    blocks.push(`[${index + 1}] ${heading}\n ${entry.url}\n ${entry.snippet}`);
  });
  return blocks.join('\n\n');
}
// --- WebFetch implementation ---
// Outcome of sniffing a response body: either "binary" with the detection
// reason, or the decoded text plus a flag telling whether it was cut at the size cap.
export type SniffResult =
| { binary: true; reason: string }
| { binary: false; text: string; truncated: boolean };
/**
 * Stream a fetch Response body, sniff the first SNIFF_HEAD_BYTES for binary
 * content, and either block (binary) or strict-decode the (capped) text body.
 * Never uses response.text() — that silently produces U+FFFD from binary.
 *
 * @param response Anything exposing a `body` stream; a missing body yields empty text.
 * @returns `{ binary: true, reason }` when the head looks binary or decoding
 *          fails, otherwise the decoded text and whether it was truncated at
 *          MAX_WEBFETCH_BODY_BYTES.
 */
export async function sniffAndDecodeBody(
response: { body: ReadableStream<Uint8Array> | null },
): Promise<SniffResult> {
const reader = response.body?.getReader();
if (!reader) return { binary: false, text: '', truncated: false };
const chunks: Buffer[] = [];
let total = 0;
let verdict: BinaryVerdict | null = null;
let truncated = false;
try {
for (;;) {
const { done, value } = await reader.read();
if (done) break;
if (!value || value.byteLength === 0) continue;
chunks.push(Buffer.from(value));
total += value.byteLength;
// Sniff exactly once, as soon as enough bytes have arrived; a binary
// verdict stops the download immediately.
if (!verdict && total >= SNIFF_HEAD_BYTES) {
const head = Buffer.concat(chunks).subarray(0, SNIFF_HEAD_BYTES);
verdict = looksLikeBinaryBytes(head);
if (verdict.binary) {
await reader.cancel();
return { binary: true, reason: verdict.reason };
}
}
// Size cap reached: stop reading; the overshoot is trimmed after the loop.
if (total >= MAX_WEBFETCH_BODY_BYTES) {
truncated = true;
await reader.cancel();
break;
}
}
} finally {
try { reader.releaseLock(); } catch { /* cancel() may or may not have released the lock depending on runtime */ }
}
let full = Buffer.concat(chunks);
// The last chunk may have pushed the total past the cap.
// NOTE(review): this cut is byte-based; if decodeText rejects an incomplete
// trailing multi-byte sequence, a truncated body could be reported as
// 'utf8-decode-fail' — confirm against binary-detect.
if (truncated) full = full.subarray(0, MAX_WEBFETCH_BODY_BYTES);
// Bodies shorter than SNIFF_HEAD_BYTES never hit the in-loop sniff; do it now.
if (!verdict) {
verdict = looksLikeBinaryBytes(full.subarray(0, SNIFF_HEAD_BYTES));
if (verdict.binary) return { binary: true, reason: verdict.reason };
}
const text = decodeText(full, verdict.encoding);
if (text === null) return { binary: true, reason: 'utf8-decode-fail' };
return { binary: false, text, truncated };
}
/**
 * WebFetch tool entry point: fetch a URL and return its content as text.
 *
 * Steps: parse the URL, run the SSRF check on the host, fetch with per-hop
 * SSRF re-validation, reject HTTP errors / PDFs / binary content types, sniff
 * and decode the (size-capped) body, convert HTML to text, and optionally
 * attach a preview screenshot. Every outcome is appended to the fetch history.
 *
 * The timeout (`toolsConfig.webfetchTimeout`, seconds, default 30) covers the
 * whole exchange including the body download, not just the response headers.
 *
 * @param input Tool arguments: `url` (required) and `selector` (accepted but
 *              not applied — CSS selection is unsupported without cheerio).
 * @param ctx   Tool context.
 * @returns Page text (plus an optional preview image) or an error result.
 */
async function executeWebFetch(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const rawUrl = input['url'] as string;
  // The selector is accepted for interface compatibility only; it is recorded
  // in the history and mentioned in the debug log, but the full page is fetched.
  const selector = input['selector'] as string | undefined;
  const timeoutSec = ctx.toolsConfig?.webfetchTimeout ?? 30;
  const allowedHosts = ctx.toolsConfig?.webfetchAllowedHosts ?? [];

  // Single place that shapes a history record for this call.
  const recordHistory = (
    status: number | null,
    contentType: string | null,
    outcome: WebFetchHistoryRecord['outcome'],
    extra: { error?: string; screenshotPath?: string } = {},
  ): void => {
    appendWebFetchHistory(ctx, {
      timestamp: new Date().toISOString(),
      url: rawUrl,
      selector,
      status,
      contentType,
      outcome,
      ...extra,
    });
  };

  // Release the connection when a response is rejected without reading it.
  const discardBody = (res: { body: ReadableStream<Uint8Array> | null }): void => {
    void res.body?.cancel().catch(() => undefined);
  };

  // Parse the URL.
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(rawUrl);
  } catch {
    recordHistory(null, null, 'invalid_url', { error: `invalid URL "${rawUrl}"` });
    return { output: `WebFetch error: invalid URL "${rawUrl}"`, isError: true };
  }
  const { hostname } = parsedUrl;

  // SSRF check on the initial host.
  try {
    await checkSSRF(hostname, allowedHosts);
  } catch (e) {
    const error = (e as Error).message;
    recordHistory(null, null, 'ssrf_blocked', { error });
    return { output: `WebFetch error: ${error}`, isError: true };
  }

  if (selector) {
    logger.debug(`[WebFetch] selector "${selector}" is specified but CSS selection is not supported without cheerio — fetching full page`);
  }
  logger.debug(`[WebFetch] fetching ${rawUrl}`);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutSec * 1000);
  try {
    // ssrfSafeFetch re-validates each redirect hop so a public URL cannot
    // 30x-bounce into a private/metadata address.
    const response = await ssrfSafeFetch(rawUrl, allowedHosts, { signal: controller.signal });

    if (!response.ok) {
      discardBody(response);
      recordHistory(response.status, response.headers.get('content-type'), 'http_error', {
        error: `${response.status} ${response.statusText}`,
      });
      return {
        output: `WebFetch error: HTTP ${response.status} ${response.statusText}. 403/429/5xx の場合は動的レンダリングや bot 検出の可能性があるので BrowseWeb で再試行してください。`,
        isError: true,
      };
    }

    const contentType = response.headers.get('content-type') ?? '';
    if (isPdfResponse(rawUrl, contentType)) {
      discardBody(response);
      recordHistory(response.status, contentType, 'pdf_blocked', { error: 'PDF content blocked' });
      return {
        output: `WebFetch cannot read PDF content from "${rawUrl}". Use ReadPdf for local PDF files instead of fetching the binary document as text.`,
        isError: true,
      };
    }
    if (isLikelyBinaryResponse(contentType)) {
      discardBody(response);
      recordHistory(response.status, contentType, 'binary_blocked', {
        error: `binary content-type "${contentType || 'unknown'}"`,
      });
      return {
        output: `WebFetch cannot read binary content-type "${contentType || 'unknown'}" from "${rawUrl}". Fetch a text/HTML endpoint instead.`,
        isError: true,
      };
    }

    // The timer is still armed here on purpose: a body that stalls after the
    // headers must be aborted by the same timeout.
    const sniffed = await sniffAndDecodeBody(response);
    clearTimeout(timer);
    if (sniffed.binary) {
      recordHistory(response.status, contentType, 'binary_blocked', {
        error: `binary content detected (${sniffed.reason})`,
      });
      return {
        output: `WebFetch blocked binary content from "${rawUrl}" (detected: ${sniffed.reason}). コンテキストに展開していません。DownloadFile で input/ に保存し、ReadExcel/ReadPdf 等で処理してください。`,
        isError: true,
      };
    }

    const text =
      htmlToText(sniffed.text) +
      (sniffed.truncated ? '\n\n[truncated: body exceeded 5MB]' : '');

    // When VLM support is enabled, a first-view screenshot is taken after the
    // text has been extracted and attached as an image. A screenshot failure
    // only produces a warning; WebFetch itself still succeeds.
    const screenshot = await captureWebFetchScreenshot(ctx, rawUrl);
    recordHistory(
      response.status,
      contentType,
      'success',
      screenshot ? { screenshotPath: path.relative(ctx.workspacePath, screenshot.savedPath) } : {},
    );
    const images = screenshot
      ? [{ dataUrl: screenshot.dataUrl, label: `WebFetch preview: ${rawUrl}` }]
      : undefined;
    return { output: text, isError: false, ...(images ? { images } : {}) };
  } catch (e) {
    const msg = (e as Error).message ?? String(e);
    logger.warn(`[WebFetch] error: ${msg}`);
    recordHistory(null, null, 'error', { error: msg });
    return {
      output: `WebFetch error: ${msg}. 接続拒否・タイムアウト・動的ページの可能性があれば BrowseWeb で再試行してください。`,
      isError: true,
    };
  } finally {
    // Safety net for every exit path (clearing twice is harmless).
    clearTimeout(timer);
  }
}
// --- DownloadFile implementation ---
/**
 * DownloadFile tool entry point: download a URL into the workspace.
 *
 * Steps: parse the URL, run the SSRF check on the host, derive a safe file
 * name, resolve the target path inside `input/` or `output/`, fetch with
 * per-hop SSRF re-validation and write the bytes to disk. Every outcome,
 * including a rejected target path, is appended to the download history.
 *
 * The timeout (`toolsConfig.webfetchTimeout`, seconds, default 60) covers the
 * whole exchange including the body download, not just the response headers.
 *
 * @param input Tool arguments: `url` (required), `filename` (optional) and
 *              `section` (`"output"` or anything else for `"input"`).
 * @param ctx   Tool context.
 * @returns A summary line on success, or an error result.
 */
async function executeDownloadFile(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const rawUrl = input['url'] as string;
  // Anything other than the exact string "output" selects "input".
  const section = input['section'] === 'output' ? 'output' : 'input';
  const timeoutSec = ctx.toolsConfig?.webfetchTimeout ?? 60;
  const allowedHosts = ctx.toolsConfig?.webfetchAllowedHosts ?? [];

  // Single place that shapes a history record; unspecified fields are null.
  const recordHistory = (
    outcome: DownloadHistoryRecord['outcome'],
    details: {
      savedPath?: string | null;
      status?: number | null;
      contentType?: string | null;
      bytes?: number | null;
      error?: string;
    } = {},
  ): void => {
    appendDownloadHistory(ctx, {
      timestamp: new Date().toISOString(),
      url: rawUrl,
      savedPath: details.savedPath ?? null,
      status: details.status ?? null,
      contentType: details.contentType ?? null,
      bytes: details.bytes ?? null,
      outcome,
      ...(details.error !== undefined ? { error: details.error } : {}),
    });
  };

  // Parse the URL.
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(rawUrl);
  } catch {
    recordHistory('invalid_url', { error: `invalid URL "${rawUrl}"` });
    return { output: `DownloadFile error: invalid URL "${rawUrl}"`, isError: true };
  }
  const { hostname } = parsedUrl;

  // SSRF check on the initial host.
  try {
    await checkSSRF(hostname, allowedHosts);
  } catch (e) {
    const error = (e as Error).message;
    recordHistory('ssrf_blocked', { error });
    return { output: `DownloadFile error: ${error}`, isError: true };
  }

  // Resolve the file name; characters that are unsafe in file names become "_".
  const toSafeName = (name: string): string => name.replace(/[\\/:*?"<>|]/g, '_');
  const requestedName = input['filename'];
  let filename: string;
  if (typeof requestedName === 'string' && requestedName.trim()) {
    filename = toSafeName(requestedName.trim());
  } else {
    const basename = path.basename(parsedUrl.pathname) || 'download';
    filename = toSafeName(basename) || 'download';
  }

  // Target path, restricted to the chosen section directory.
  let resolvedSavePath: string;
  try {
    resolvedSavePath = resolveOutputPathWithin(ctx.workspacePath, path.join(section, filename), [section]);
  } catch (e) {
    const error = (e as Error).message;
    // Logged like every other failure so rejected paths show up in the history.
    recordHistory('error', { error });
    return { output: error, isError: true };
  }

  logger.debug(`[DownloadFile] downloading ${rawUrl} -> ${section}/${filename}`);
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutSec * 1000);
  try {
    const response = await ssrfSafeFetch(rawUrl, allowedHosts, { signal: controller.signal });
    if (!response.ok) {
      // Release the connection: the error body is never read.
      void response.body?.cancel().catch(() => undefined);
      recordHistory('http_error', {
        status: response.status,
        contentType: response.headers.get('content-type'),
        error: `${response.status} ${response.statusText}`,
      });
      return { output: `DownloadFile error: HTTP ${response.status} ${response.statusText}`, isError: true };
    }

    const contentType = response.headers.get('content-type') ?? '';
    // The timer is still armed here on purpose: a body that stalls after the
    // headers must be aborted by the same timeout.
    const buf = Buffer.from(await response.arrayBuffer());
    clearTimeout(timer);

    fs.mkdirSync(path.dirname(resolvedSavePath), { recursive: true });
    fs.writeFileSync(resolvedSavePath, buf);
    recordHistory('success', {
      savedPath: `${section}/${filename}`,
      status: response.status,
      contentType,
      bytes: buf.length,
    });
    return {
      output: `Downloaded ${buf.length} bytes from ${rawUrl} -> ${section}/${filename} (content-type: ${contentType || 'unknown'})`,
      isError: false,
    };
  } catch (e) {
    const msg = (e as Error).message ?? String(e);
    logger.warn(`[DownloadFile] error: ${msg}`);
    recordHistory('error', { error: msg });
    return { output: `DownloadFile error: ${msg}`, isError: true };
  } finally {
    // Safety net for every exit path (clearing twice is harmless).
    clearTimeout(timer);
  }
}
// --- Exports ---
// Tool schemas keyed by tool name; the keys match the names dispatched by executeTool.
export const TOOL_DEFS: Record<string, ToolDef> = {
WebSearch: WEBSEARCH_DEF,
WebFetch: WEBFETCH_DEF,
DownloadFile: DOWNLOADFILE_DEF,
};
/**
 * Dispatch a tool call to the matching implementation in this module.
 *
 * @param name  Tool name (`WebSearch`, `WebFetch` or `DownloadFile`).
 * @param input Raw tool arguments.
 * @param ctx   Tool context.
 * @returns The tool result, or `null` when the name is not handled here.
 */
export async function executeTool(
  name: string,
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult | null> {
  if (name === 'WebSearch') {
    return executeWebSearch(input, ctx);
  }
  if (name === 'WebFetch') {
    return executeWebFetch(input, ctx);
  }
  if (name === 'DownloadFile') {
    return executeDownloadFile(input, ctx);
  }
  return null;
}