import * as fs from 'fs';
import * as path from 'path';
import MsgReaderImport from '@kenjiuno/msgreader';
import { ToolDef } from '../../llm/openai-compat.js';
import type { ToolContext, ToolResult } from './core.js';
import { resolveAndGuard, truncateToBudget, getToolOutputBudgetTokens } from './core.js';
import { logger } from '../../logger.js';

// CJS/ESM interop: under native Node ESM (the built dist), a default import of
// this CommonJS package resolves to the module.exports namespace object, not the
// class — so `new MsgReaderImport()` throws "is not a constructor". Vitest/tsx
// hide this via __esModule interop. Pick the real constructor for both worlds.
const MsgReader = (
  typeof MsgReaderImport === 'function'
    ? MsgReaderImport
    : (MsgReaderImport as unknown as { default: typeof MsgReaderImport }).default
) as typeof MsgReaderImport;
type MsgReaderInstance = InstanceType<typeof MsgReader>;

/** Size cap (in MB) applied when the tool configuration does not provide a usable one. */
const DEFAULT_MSG_MAX_SIZE_MB = 25;

/** A display name and/or address for a sender or recipient. */
export interface MsgAddress {
  name?: string;
  email?: string;
}

/** Per-attachment outcome reported back to the caller. */
export interface MsgAttachmentMeta {
  fileName: string;
  contentLength?: number;
  /** Relative path the attachment was written to (when saved). */
  savedPath?: string;
  /** Reason the attachment was not saved (mutually exclusive with savedPath). */
  skipped?: string;
}

/** Normalized view of a parsed message, ready for rendering. */
export interface MsgView {
  subject?: string;
  from?: MsgAddress;
  to: MsgAddress[];
  cc: MsgAddress[];
  date?: string;
  body: { text: string; format: 'plain' | 'html' | 'none' };
  attachments: MsgAttachmentMeta[];
}

/** Extract a printable message from anything that was thrown (not only Error instances). */
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/** Render a single address as `Name <email>`, falling back gracefully. */
export function formatAddress(a: MsgAddress): string {
  const name = a.name?.trim();
  const email = a.email?.trim();
  if (name && email) return `${name} <${email}>`;
  if (name) return name;
  if (email) return email;
  return '(unknown)';
}

// Named character references we decode. Numeric references (including `&#39;`)
// are handled by the numeric branches of ENTITY_PATTERN. A Map is used instead
// of an object literal so that inherited keys can never be looked up.
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['&nbsp;', ' '],
  ['&amp;', '&'],
  ['&lt;', '<'],
  ['&gt;', '>'],
  ['&quot;', '"'],
  ['&apos;', "'"],
]);

// One alternation for decimal, hexadecimal and named references so that every
// reference is decoded exactly once.
const ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|[a-zA-Z][a-zA-Z0-9]*);/g;

// Decode a numeric character reference, preserving the original entity if the
// code point is out of range (broken email HTML must not crash the whole read).
function safeFromCodePoint(code: number, original: string): string {
  if (!Number.isFinite(code) || code < 0 || code > 0x10ffff || (code >= 0xd800 && code <= 0xdfff)) {
    return original;
  }
  try {
    return String.fromCodePoint(code);
  } catch {
    return original;
  }
}

/**
 * Decode the supported named entities and all numeric character references.
 *
 * Decoding happens in a single pass: the output of one replacement is never
 * re-scanned, so `&amp;#60;` becomes the literal text `&#60;` rather than `<`.
 * Unknown named entities and out-of-range numeric references are left as-is.
 */
function decodeEntities(s: string): string {
  return s.replace(ENTITY_PATTERN, (match: string, dec?: string, hex?: string) => {
    if (dec !== undefined) return safeFromCodePoint(Number(dec), match);
    if (hex !== undefined) return safeFromCodePoint(parseInt(hex, 16), match);
    return NAMED_ENTITIES.get(match) ?? match;
  });
}

/**
 * Convert an HTML fragment into readable plain text.
 *
 * @param html Raw HTML markup.
 * @returns Text with tags removed, entities decoded, intra-line whitespace
 *   collapsed and runs of blank lines reduced to a single blank line.
 */
export function stripHtml(html: string): string {
  const withoutMarkup = html
    // Drop script/style blocks including their contents.
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1>/gi, '')
    // Drop whole comments. The generic tag regex below stops at the first `>`,
    // which would leak the remainder of comments that contain one (e.g.
    // Outlook conditional comments).
    .replace(/<!--[\s\S]*?-->/g, '')
    // Treat <br> and block-level boundaries as newlines.
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/(p|div|li|tr|h[1-6]|ul|ol|table|blockquote|section|article)\s*>/gi, '\n')
    // Remove all remaining tags.
    .replace(/<[^>]+>/g, '');

  // Normalize whitespace: collapse intra-line runs, trim each line, collapse blank runs.
  return decodeEntities(withoutMarkup)
    .split('\n')
    .map((line) => line.replace(/[^\S\n]+/g, ' ').trim())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/**
 * Choose the best available body text, preferring plain over HTML.
 *
 * @returns The trimmed plain body when present; otherwise the first HTML
 *   source that still has text after stripping; otherwise an empty body with
 *   format `'none'`.
 */
export function selectMsgBody(fields: {
  body?: string;
  bodyHtml?: string;
  // PidTagHtml: some HTML-only messages carry the body here as raw bytes.
  html?: Uint8Array | string;
}): {
  text: string;
  format: 'plain' | 'html' | 'none';
} {
  const plain = fields.body?.trim();
  if (plain) return { text: plain, format: 'plain' };

  let rawHtml: string | undefined;
  if (fields.html != null) {
    rawHtml = typeof fields.html === 'string' ? fields.html : Buffer.from(fields.html).toString('utf8');
  }

  // Try each HTML source in order; an empty/whitespace bodyHtml must not block
  // the PidTagHtml fallback, so we check the stripped result of each.
  for (const source of [fields.bodyHtml, rawHtml]) {
    if (!source) continue;
    const stripped = stripHtml(source);
    if (stripped) return { text: stripped, format: 'html' };
  }
  return { text: '', format: 'none' };
}

/**
 * Pick the most usable email address from candidates, preferring a real SMTP
 * address (contains '@') over an Exchange legacy EX DN (`/O=.../CN=...`).
 */
export function pickEmail(...candidates: (string | undefined)[]): string | undefined {
  const valid = candidates.map((c) => c?.trim()).filter((c): c is string => !!c);
  return valid.find((c) => c.includes('@')) ?? valid[0];
}

/**
 * Reduce an attachment name to a safe basename, never escaping the target dir.
 *
 * @param name Attachment name as stored in the message (untrusted).
 * @param index Zero-based attachment position, used for the fallback name.
 * @returns The cleaned basename, or `attachment-<index + 1>` when nothing usable remains.
 */
export function sanitizeAttachmentName(name: string, index: number): string {
  // Take the last path segment across both separators (defends path traversal).
  const base = name.split(/[/\\]/).pop() ?? '';
  // Strip control characters and null bytes.
  // eslint-disable-next-line no-control-regex
  const cleaned = base.replace(/[\x00-\x1f\x7f]/g, '').trim();
  // Reject names that are empty or consist only of dots/spaces.
  if (!cleaned || /^[.\s]*$/.test(cleaned)) {
    return `attachment-${index + 1}`;
  }
  return cleaned;
}

/** Find a filename that collides with neither this run nor existing files on disk. */
function resolveFreeName(dir: string, name: string, used: Set<string>): string {
  const ext = path.extname(name);
  const stem = name.slice(0, name.length - ext.length);
  let candidate = name;
  let n = 1;
  while (used.has(candidate) || fs.existsSync(path.join(dir, candidate))) {
    candidate = `${stem}-${n}${ext}`;
    n += 1;
  }
  return candidate;
}

/**
 * Build the human-readable text output for a parsed message.
 *
 * Sections (headers, body, attachment list) are separated by a blank line.
 */
export function formatMsgOutput(view: MsgView): string {
  const lines: string[] = [];
  if (view.subject) lines.push(`Subject: ${view.subject}`);
  if (view.from) lines.push(`From: ${formatAddress(view.from)}`);
  if (view.to.length) lines.push(`To: ${view.to.map(formatAddress).join(', ')}`);
  if (view.cc.length) lines.push(`Cc: ${view.cc.map(formatAddress).join(', ')}`);
  if (view.date) lines.push(`Date: ${view.date}`);

  const parts: string[] = [lines.join('\n')];
  parts.push(view.body.format === 'none' ? '(no text body)' : view.body.text);

  if (view.attachments.length) {
    const attLines = [`Attachments (${view.attachments.length}):`];
    for (const att of view.attachments) {
      if (att.savedPath) {
        const size = att.contentLength != null ? ` (${att.contentLength} bytes)` : '';
        attLines.push(`- ${att.fileName}${size} -> ${att.savedPath}`);
      } else {
        attLines.push(`- ${att.fileName} - skipped: ${att.skipped ?? 'not saved'}`);
      }
    }
    parts.push(attLines.join('\n'));
  }
  return parts.join('\n\n');
}

/**
 * Assemble the final output, truncating ONLY the body to the token budget.
 * Headers and the attachment list (with saved input/ paths) always survive —
 * attachments are already written to disk and the caller needs their paths.
 *
 * @param view Parsed message to render.
 * @param budgetTokens Total token budget for the tool output.
 * @param sourceLabel Label forwarded to `truncateToBudget`.
 */
export function assembleMsgOutput(view: MsgView, budgetTokens: number, sourceLabel: string): string {
  // Render everything except the body to estimate how much budget it consumes
  // (roughly 4 characters per token, plus a small safety margin).
  const shell = formatMsgOutput({ ...view, body: { text: '', format: 'plain' } });
  const reserveTokens = Math.ceil(shell.length / 4) + 64;
  // The body always keeps at least 500 tokens, even with a large shell.
  const bodyBudget = Math.max(500, budgetTokens - reserveTokens);
  const bodyText = view.body.format === 'none' ? '' : view.body.text;
  const truncatedBody = truncateToBudget(bodyText, bodyBudget, { sourceLabel }).text;
  return formatMsgOutput({ ...view, body: { text: truncatedBody, format: view.body.format } });
}

/**
 * msgreader's getFileData() returns `{ error: 'Unsupported file type!' }` (not a
 * throw) for CFBF files that aren't Outlook messages (legacy .doc/.xls, broken
 * compound files). Treat anything whose root isn't a 'msg' as a read failure.
 */
export function isParsedMsgValid(fields: { error?: string; dataType?: string | null }): boolean {
  return !fields.error && fields.dataType === 'msg';
}

/** Tool definition advertised to the model for `ReadMsg`. */
export const READ_MSG_DEF: ToolDef = {
  type: 'function',
  function: {
    name: 'ReadMsg',
    description:
      'Read an Outlook .msg email file, extracting subject/sender/recipients/body and saving attachments to input/. 詳細は ReadToolDoc({ name: "ReadMsg" }) で取得可能。',
    parameters: {
      type: 'object',
      properties: {
        file_path: { type: 'string', description: 'Path to the .msg file' },
      },
      required: ['file_path'],
    },
  },
};

/** The subset of msgreader's parsed fields that this tool reads. */
interface FieldsLike {
  error?: string;
  dataType?: string | null;
  subject?: string;
  senderName?: string;
  senderEmail?: string;
  senderSmtpAddress?: string;
  body?: string;
  bodyHtml?: string;
  html?: Uint8Array;
  messageDeliveryTime?: string;
  clientSubmitTime?: string;
  recipients?: { name?: string; email?: string; smtpAddress?: string; recipType?: string }[];
  attachments?: {
    fileName?: string;
    fileNameShort?: string;
    contentLength?: number;
    innerMsgContent?: boolean;
    dataType?: string | null;
  }[];
}

/**
 * Execute the `ReadMsg` tool: parse an Outlook `.msg` file, save its
 * attachments under `<workspace>/input/` (only when edits are allowed) and
 * return a text rendering of headers, body and attachment list.
 *
 * Failures are never thrown; they are reported as `{ isError: true }` results
 * whose output starts with `ReadMsg:`.
 *
 * @param input Tool arguments; `file_path` is required.
 * @param ctx Tool context (workspace path, edit permission, tool configuration).
 */
export async function executeReadMsg(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const filePath = String(input.file_path ?? '');
  if (!filePath) {
    return { output: 'ReadMsg: file_path is required', isError: true };
  }

  let resolved: string;
  try {
    resolved = resolveAndGuard(ctx.workspacePath, filePath);
  } catch (e) {
    return { output: `ReadMsg: ${errorMessage(e)}`, isError: true };
  }

  // Enforce a size cap before loading the whole file into memory (matches the
  // other office tools, which each guard against oversized inputs).
  const configuredMb = ctx.toolsConfig?.officeMsgMaxSizeMb;
  const maxMb =
    typeof configuredMb === 'number' && Number.isFinite(configuredMb) && configuredMb > 0
      ? configuredMb
      : DEFAULT_MSG_MAX_SIZE_MB;
  try {
    const sizeMb = fs.statSync(resolved).size / 1024 / 1024;
    if (sizeMb > maxMb) {
      return {
        output: `ReadMsg: file size ${sizeMb.toFixed(1)}MB exceeds limit of ${maxMb}MB`,
        isError: true,
      };
    }
  } catch (e) {
    return { output: `ReadMsg: cannot stat file: ${errorMessage(e)}`, isError: true };
  }

  let buffer: Buffer;
  try {
    buffer = fs.readFileSync(resolved);
  } catch (e) {
    return { output: `ReadMsg: cannot read file: ${errorMessage(e)}`, isError: true };
  }

  // .msg is an OLE2 / CFBF compound file. Validate the magic header up front:
  // MsgReader silently returns an empty result for non-CFBF data instead of throwing.
  const CFBF_MAGIC = Buffer.from([0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1]);
  if (buffer.length < 8 || !buffer.subarray(0, 8).equals(CFBF_MAGIC)) {
    return {
      output: `ReadMsg: not a valid Outlook .msg file (bad signature): ${path.basename(resolved)}`,
      isError: true,
    };
  }

  let reader: MsgReaderInstance;
  let fields: FieldsLike;
  try {
    // Copy into a standalone ArrayBuffer (MsgReader rejects Node Buffers).
    const arrayBuffer = new Uint8Array(buffer).buffer;
    reader = new MsgReader(arrayBuffer);
    fields = reader.getFileData() as unknown as FieldsLike;
  } catch (e) {
    return {
      output: `ReadMsg: failed to parse .msg (is this a valid Outlook message?): ${errorMessage(e)}`,
      isError: true,
    };
  }

  if (!isParsedMsgValid(fields)) {
    return {
      output: `ReadMsg: not a parseable Outlook message${fields.error ? ` (${fields.error})` : ''}: ${path.basename(resolved)}`,
      isError: true,
    };
  }

  // Recipients without a recipType are treated as "to"; anything other than
  // "to"/"cc" is not listed.
  const recipients = fields.recipients ?? [];
  const to = recipients
    .filter((r) => (r.recipType ?? 'to') === 'to')
    .map((r) => ({ name: r.name, email: pickEmail(r.smtpAddress, r.email) }));
  const cc = recipients
    .filter((r) => r.recipType === 'cc')
    .map((r) => ({ name: r.name, email: pickEmail(r.smtpAddress, r.email) }));

  const inputDir = path.join(ctx.workspacePath, 'input');
  const attachments: MsgAttachmentMeta[] = [];
  const rawAttachments = fields.attachments ?? [];
  const usedNames = new Set<string>();

  for (const [i, att] of rawAttachments.entries()) {
    const rawName = att.fileName || att.fileNameShort || '';
    const baseName = sanitizeAttachmentName(rawName, i);

    // Read-only movements (verify etc.) must not mutate the workspace.
    if (!ctx.editAllowed) {
      attachments.push({
        fileName: baseName,
        contentLength: att.contentLength,
        skipped: 'read-only phase (attachment not saved)',
      });
      continue;
    }

    if (att.innerMsgContent) {
      attachments.push({
        fileName: baseName,
        contentLength: att.contentLength,
        skipped: 'embedded message (open separately)',
      });
      continue;
    }

    // Resolve a name that collides with neither an earlier attachment this run
    // nor a file already present in input/ (user uploads, prior extractions).
    const name = resolveFreeName(inputDir, baseName, usedNames);
    usedNames.add(name);

    try {
      const data = reader.getAttachment(att as never);
      fs.mkdirSync(inputDir, { recursive: true });
      const dest = path.join(inputDir, name);
      // 'wx' creates the file exclusively: it never overwrites a file that
      // appeared after the existence check and never writes through a dangling
      // symlink (which existsSync reports as absent). Such cases surface below
      // as an extraction failure for this attachment only.
      fs.writeFileSync(dest, Buffer.from(data.content), { flag: 'wx' });
      attachments.push({
        fileName: name,
        contentLength: data.content.length,
        savedPath: path.join('input', name),
      });
    } catch (e) {
      const reason = errorMessage(e);
      logger.warn(`[ReadMsg] failed to save attachment ${name}: ${reason}`);
      attachments.push({
        fileName: name,
        contentLength: att.contentLength,
        skipped: `extraction failed: ${reason}`,
      });
    }
  }

  // Only emit a From header when at least a name or an address is known.
  const senderEmail = pickEmail(fields.senderSmtpAddress, fields.senderEmail);
  const from: MsgAddress | undefined =
    fields.senderName || senderEmail ? { name: fields.senderName, email: senderEmail } : undefined;

  const view: MsgView = {
    subject: fields.subject,
    from,
    to,
    cc,
    date: fields.messageDeliveryTime || fields.clientSubmitTime,
    body: selectMsgBody(fields),
    attachments,
  };

  const output = assembleMsgOutput(view, getToolOutputBudgetTokens(ctx), path.basename(resolved));
  logger.info(
    `[ReadMsg] ${path.basename(resolved)}: attachments=${attachments.length} bodyFormat=${view.body.format}`,
  );
  return { output, isError: false };
}