oss-sync 3b1645cc91
Some checks failed
CI / build-and-test (push) Has been cancelled
sync: update from private repo (d31b280)
2026-06-11 11:28:40 +00:00

417 lines
14 KiB
TypeScript

import * as fs from 'fs';
import * as path from 'path';
import MsgReaderImport from '@kenjiuno/msgreader';
import { ToolDef } from '../../llm/openai-compat.js';
import type { ToolContext, ToolResult } from './core.js';
import { resolveAndGuard, truncateToBudget, getToolOutputBudgetTokens } from './core.js';
import { logger } from '../../logger.js';
// CJS/ESM interop: under native Node ESM (the built dist), a default import of
// this CommonJS package resolves to the module.exports namespace object, not the
// class — so `new MsgReaderImport()` throws "is not a constructor". Vitest/tsx
// hide this via __esModule interop. Pick the real constructor for both worlds:
// use the import as-is when it is already callable, otherwise unwrap `.default`.
const MsgReader = (
typeof MsgReaderImport === 'function'
? MsgReaderImport
: (MsgReaderImport as unknown as { default: typeof MsgReaderImport }).default
) as typeof MsgReaderImport;
/** Instance type of the constructor resolved above. */
type MsgReaderInstance = InstanceType<typeof MsgReader>;
/** Size cap in MB applied to .msg inputs when the tool config supplies no valid positive limit. */
const DEFAULT_MSG_MAX_SIZE_MB = 25;
/** A sender or recipient; either field may be absent. */
export interface MsgAddress {
/** Display name, if present. */
name?: string;
/** Email address, if present. */
email?: string;
}
/** Per-attachment record listed in the formatted tool output. */
export interface MsgAttachmentMeta {
/** Name shown for the attachment in the output listing. */
fileName: string;
/** Attachment size; printed with a "bytes" suffix for saved attachments. */
contentLength?: number;
/** Relative path the attachment was written to (when saved). */
savedPath?: string;
/** Reason the attachment was not saved (mutually exclusive with savedPath). */
skipped?: string;
}
/** Normalized view of a parsed message, as rendered by formatMsgOutput. */
export interface MsgView {
/** Subject line; the `Subject:` header is omitted when empty or missing. */
subject?: string;
/** Sender; the `From:` header is omitted when missing. */
from?: MsgAddress;
/** "To" recipients (may be empty). */
to: MsgAddress[];
/** "Cc" recipients (may be empty). */
cc: MsgAddress[];
/** Date string, emitted verbatim on the `Date:` header line. */
date?: string;
/** Body text and where it came from; 'none' is rendered as "(no text body)". */
body: { text: string; format: 'plain' | 'html' | 'none' };
/** One entry per attachment, saved or skipped. */
attachments: MsgAttachmentMeta[];
}
/**
 * Format one address for display.
 *
 * Both fields are trimmed first. The result is `Name <email>` when both are
 * present, whichever one exists when only one is, and `(unknown)` when neither
 * carries any text.
 *
 * @param a - Address to render.
 * @returns The display string.
 */
export function formatAddress(a: MsgAddress): string {
  const displayName = a.name?.trim() ?? '';
  const address = a.email?.trim() ?? '';
  if (displayName === '') {
    return address === '' ? '(unknown)' : address;
  }
  return address === '' ? displayName : `${displayName} <${address}>`;
}
/** Replacement text for each supported named (or pre-spelled) character reference. */
const NAMED_ENTITIES: Record<string, string> = {
  '&nbsp;': ' ',
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",
};

/**
 * Decode a numeric character reference, preserving the original entity if the
 * code point is out of range (broken email HTML must not crash the whole read).
 *
 * @param code - Parsed code point (may be NaN/Infinity for absurd inputs).
 * @param original - The entity text as it appeared, returned when `code` is unusable.
 * @returns The decoded character, or `original`.
 */
function safeFromCodePoint(code: number, original: string): string {
  const isSurrogate = code >= 0xd800 && code <= 0xdfff;
  if (!Number.isFinite(code) || code < 0 || code > 0x10ffff || isSurrogate) {
    return original;
  }
  try {
    return String.fromCodePoint(code);
  } catch {
    return original;
  }
}

/**
 * Decode the supported HTML character references in `s`.
 *
 * Handles `&nbsp; &amp; &lt; &gt; &quot; &apos;` plus decimal (`&#65;`) and
 * hexadecimal (`&#x41;` / `&#X41;`) numeric references. Everything is decoded
 * in ONE pass so that the output of one reference is never re-interpreted as
 * another: `&amp;#39;` yields the literal text `&#39;`, not an apostrophe.
 *
 * @param s - Text that may contain character references.
 * @returns The text with recognised references replaced; unknown or invalid
 *   references are left untouched.
 */
function decodeEntities(s: string): string {
  return s.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|nbsp|amp|lt|gt|quot|apos);/g,
    (match: string, decimal: string | undefined, hex: string | undefined) => {
      if (decimal !== undefined) return safeFromCodePoint(Number(decimal), match);
      if (hex !== undefined) return safeFromCodePoint(parseInt(hex, 16), match);
      // Named reference: the whole match (including `&` and `;`) is the map key.
      return NAMED_ENTITIES[match] ?? match;
    },
  );
}
/**
 * Convert an HTML fragment into readable plain text.
 *
 * Steps, in order:
 *  1. drop `<script>` / `<style>` elements together with their contents;
 *  2. drop HTML comments, including Outlook conditional comments such as
 *     `<!--[if gte mso 9]><xml>…</xml><![endif]-->` whose inner text would
 *     otherwise leak into the result once tags are removed;
 *  3. turn `<br>` and block-level closing tags into newlines;
 *  4. remove every remaining tag and decode character references;
 *  5. normalize whitespace (collapse runs inside a line, trim each line,
 *     collapse three or more newlines to a single blank line).
 *
 * @param html - HTML markup.
 * @returns Plain text; empty string when nothing readable remains.
 */
export function stripHtml(html: string): string {
  const text = html
    // Closing tag may carry whitespace before `>` (e.g. `</style >`).
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '')
    // Comments must go as a unit: they may contain `>` and nested markup.
    .replace(/<!--[\s\S]*?-->/g, '')
    // Treat <br> and block-level boundaries as newlines.
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/(p|div|li|tr|h[1-6]|ul|ol|table|blockquote|section|article)\s*>/gi, '\n')
    // Remove all remaining tags.
    .replace(/<[^>]+>/g, '');

  return decodeEntities(text)
    .split('\n')
    .map((line) => line.replace(/[^\S\n]+/g, ' ').trim())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
/**
 * Choose the best available body text, preferring plain over HTML.
 *
 * A non-blank `body` wins outright. Otherwise each HTML source is tried in
 * turn (`bodyHtml`, then `html`) and the first one that still has text after
 * tag stripping is used, so a blank `bodyHtml` never hides a usable `html`.
 *
 * @param fields - Candidate body fields. `html` (PidTagHtml) may arrive as raw
 *   bytes, in which case it is read as UTF-8.
 * @returns The chosen text and its origin; `{ text: '', format: 'none' }` when
 *   no source yields any text.
 */
export function selectMsgBody(fields: {
  body?: string;
  bodyHtml?: string;
  // PidTagHtml: some HTML-only messages carry the body here as raw bytes.
  html?: Uint8Array | string;
}): {
  text: string;
  format: 'plain' | 'html' | 'none';
} {
  const plainText = fields.body?.trim();
  if (plainText) {
    return { text: plainText, format: 'plain' };
  }

  let rawHtml: string | undefined;
  if (fields.html != null) {
    rawHtml =
      typeof fields.html === 'string' ? fields.html : Buffer.from(fields.html).toString('utf8');
  }

  for (const markup of [fields.bodyHtml, rawHtml]) {
    const text = markup ? stripHtml(markup) : '';
    if (text) {
      return { text, format: 'html' };
    }
  }
  return { text: '', format: 'none' };
}
/**
 * Choose the most usable email address among the candidates.
 *
 * Candidates are trimmed and blank/undefined ones ignored. The first one that
 * contains '@' (a real SMTP address) wins; failing that, the first non-blank
 * candidate is returned (e.g. an Exchange legacy DN like `/O=.../CN=...`).
 *
 * @param candidates - Addresses in order of preference.
 * @returns The chosen address, or `undefined` when every candidate is blank.
 */
export function pickEmail(...candidates: (string | undefined)[]): string | undefined {
  let fallback: string | undefined;
  for (const raw of candidates) {
    const value = raw?.trim();
    if (!value) continue;
    if (value.includes('@')) return value;
    if (fallback === undefined) fallback = value;
  }
  return fallback;
}
/**
 * Reduce an attachment name to a safe basename that cannot escape the target dir.
 *
 * Only the part after the last `/` or `\` is kept, control characters are
 * removed and the result is trimmed. If nothing but dots/whitespace remains,
 * a positional placeholder `attachment-<index + 1>` is returned instead.
 *
 * @param name - Attachment name as stored in the message.
 * @param index - Zero-based attachment position, used for the placeholder.
 * @returns A basename with no path separators.
 */
export function sanitizeAttachmentName(name: string, index: number): string {
  // Keep only the final path segment, whichever separator style was used.
  const lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
  const tail = name.slice(lastSeparator + 1);
  // eslint-disable-next-line no-control-regex
  const cleaned = tail.replace(/[\x00-\x1f\x7f]/g, '').trim();
  // Empty, ".", ".." and similar dot/space-only names are not usable filenames.
  const unusable = cleaned.length === 0 || /^[.\s]*$/.test(cleaned);
  return unusable ? `attachment-${index + 1}` : cleaned;
}
/**
 * Find a filename that collides with neither this run nor existing files on disk.
 *
 * `name` is returned unchanged when free. Otherwise `-1`, `-2`, … is inserted
 * before the extension until a free name is found.
 *
 * @param dir - Directory the file will be written to.
 * @param name - Preferred filename.
 * @param used - Names already handed out during this run.
 * @returns A name that is absent from `used` and does not exist in `dir`.
 */
function resolveFreeName(dir: string, name: string, used: Set<string>): string {
  const isTaken = (fileName: string): boolean =>
    used.has(fileName) || fs.existsSync(path.join(dir, fileName));

  if (!isTaken(name)) {
    return name;
  }

  const suffix = path.extname(name);
  const base = name.slice(0, name.length - suffix.length);
  for (let counter = 1; ; counter += 1) {
    const attempt = `${base}-${counter}${suffix}`;
    if (!isTaken(attempt)) {
      return attempt;
    }
  }
}
/**
 * Build the human-readable text output for a parsed message.
 *
 * The output consists of up to three sections separated by a blank line:
 *  1. header lines (`Subject`, `From`, `To`, `Cc`, `Date`) — each only when
 *     present; the whole section is omitted when no header is available, so the
 *     output never starts with blank lines;
 *  2. the body text, or `(no text body)` when `view.body.format` is 'none';
 *  3. the attachment listing, when there is at least one attachment.
 *
 * @param view - Parsed message to render.
 * @returns The formatted text.
 */
export function formatMsgOutput(view: MsgView): string {
  const headers: string[] = [];
  if (view.subject) headers.push(`Subject: ${view.subject}`);
  if (view.from) headers.push(`From: ${formatAddress(view.from)}`);
  if (view.to.length) headers.push(`To: ${view.to.map(formatAddress).join(', ')}`);
  if (view.cc.length) headers.push(`Cc: ${view.cc.map(formatAddress).join(', ')}`);
  if (view.date) headers.push(`Date: ${view.date}`);

  const sections: string[] = [];
  // An empty header section would only contribute leading blank lines.
  if (headers.length) sections.push(headers.join('\n'));
  sections.push(view.body.format === 'none' ? '(no text body)' : view.body.text);

  if (view.attachments.length) {
    const listing = [`Attachments (${view.attachments.length}):`];
    for (const att of view.attachments) {
      if (att.savedPath) {
        const size = att.contentLength != null ? ` (${att.contentLength} bytes)` : '';
        listing.push(`- ${att.fileName}${size} -> ${att.savedPath}`);
      } else {
        listing.push(`- ${att.fileName} - skipped: ${att.skipped ?? 'not saved'}`);
      }
    }
    sections.push(listing.join('\n'));
  }
  return sections.join('\n\n');
}
/**
 * Assemble the final output, truncating ONLY the body to the token budget.
 *
 * The headers and attachment list are rendered once with an empty body to
 * estimate their cost (length / 4, plus 64 tokens of slack); the remainder of
 * the budget — never less than 500 tokens — is what the body may use. Headers
 * and the attachment list (with saved input/ paths) therefore always survive:
 * attachments are already on disk and the caller needs their paths.
 *
 * @param view - Parsed message.
 * @param budgetTokens - Total token budget for the tool output.
 * @param sourceLabel - Label passed through to `truncateToBudget`.
 * @returns The formatted output with a possibly truncated body.
 */
export function assembleMsgOutput(view: MsgView, budgetTokens: number, sourceLabel: string): string {
  const { body } = view;

  // Cost of everything except the body.
  const frame = formatMsgOutput({ ...view, body: { text: '', format: 'plain' } });
  const frameTokens = Math.ceil(frame.length / 4) + 64;

  const tokensForBody = Math.max(500, budgetTokens - frameTokens);
  const rawBody = body.format === 'none' ? '' : body.text;
  const { text: fittedBody } = truncateToBudget(rawBody, tokensForBody, { sourceLabel });

  return formatMsgOutput({ ...view, body: { text: fittedBody, format: body.format } });
}
/**
 * Decide whether msgreader's parse result describes a real Outlook message.
 *
 * msgreader's getFileData() returns `{ error: 'Unsupported file type!' }` (not a
 * throw) for CFBF files that aren't Outlook messages (legacy .doc/.xls, broken
 * compound files), so a result is accepted only when it carries no error and
 * its root `dataType` is exactly 'msg'.
 *
 * @param fields - The parse result (only `error` and `dataType` are inspected).
 * @returns `true` when the result can be treated as a message.
 */
export function isParsedMsgValid(fields: { error?: string; dataType?: string | null }): boolean {
  if (fields.error) {
    return false;
  }
  return fields.dataType === 'msg';
}
/**
 * Tool definition for `ReadMsg`: a function tool taking a single required
 * string parameter, `file_path`. The trailing Japanese sentence in the
 * description says that details can be obtained via
 * `ReadToolDoc({ name: "ReadMsg" })`; it is runtime text and must stay as-is.
 */
export const READ_MSG_DEF: ToolDef = {
type: 'function',
function: {
name: 'ReadMsg',
description:
'Read an Outlook .msg email file, extracting subject/sender/recipients/body and saving attachments to input/. 詳細は ReadToolDoc({ name: "ReadMsg" }) で取得可能。',
parameters: {
type: 'object',
properties: {
file_path: { type: 'string', description: 'Path to the .msg file' },
},
required: ['file_path'],
},
},
};
/**
 * The subset of the `getFileData()` result that this module reads. The real
 * return value is cast to this shape, so every field is optional.
 */
interface FieldsLike {
/** Set when the parser reports a failure instead of throwing. */
error?: string;
/** Root node type; must be 'msg' for the result to be accepted. */
dataType?: string | null;
subject?: string;
senderName?: string;
/** Sender address; used only when senderSmtpAddress is not a better candidate. */
senderEmail?: string;
/** Sender address tried first when choosing the sender email. */
senderSmtpAddress?: string;
/** Plain-text body. */
body?: string;
/** HTML body as a string. */
bodyHtml?: string;
/** HTML body as raw bytes; decoded as UTF-8 when used. */
html?: Uint8Array;
/** Preferred source for the displayed date. */
messageDeliveryTime?: string;
/** Fallback source for the displayed date. */
clientSubmitTime?: string;
/** Recipients; `recipType` 'to' / 'cc' selects the list, a missing type counts as 'to'. */
recipients?: { name?: string; email?: string; smtpAddress?: string; recipType?: string }[];
attachments?: {
/** Preferred attachment name. */
fileName?: string;
/** Used when fileName is empty or missing. */
fileNameShort?: string;
contentLength?: number;
/** Truthy for embedded messages, which are listed but not extracted. */
innerMsgContent?: boolean;
dataType?: string | null;
}[];
}
/** Extract a printable message from an arbitrary thrown value (not only `Error`s). */
function describeError(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/** Return the configured .msg size cap in MB, or the default when it is absent or not a positive finite number. */
function resolveMsgMaxSizeMb(ctx: ToolContext): number {
  const configured: unknown = ctx.toolsConfig?.officeMsgMaxSizeMb;
  return typeof configured === 'number' && Number.isFinite(configured) && configured > 0
    ? configured
    : DEFAULT_MSG_MAX_SIZE_MB;
}

/**
 * Execute the `ReadMsg` tool: parse an Outlook .msg file and return its
 * headers, body and attachment listing as text.
 *
 * Flow:
 *  1. resolve `file_path` inside the workspace via `resolveAndGuard`;
 *  2. enforce the size cap, read the file and check the CFBF signature;
 *  3. parse with MsgReader and reject results that are not an Outlook message;
 *  4. when `ctx.editAllowed` is set, write each non-embedded attachment into
 *     `<workspace>/input/` under a sanitized, non-colliding name; otherwise
 *     (and for embedded messages) only list it as skipped;
 *  5. format the result, truncating only the body to the output budget.
 *
 * Every failure is reported through the returned `ToolResult` with
 * `isError: true`; a failing attachment is logged, listed as skipped and does
 * not fail the whole read.
 *
 * @param input - Tool arguments; `file_path` is required.
 * @param ctx - Tool context (workspace path, tools config, edit permission).
 * @returns The formatted message text, or an error description.
 */
export async function executeReadMsg(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const filePath = String(input.file_path ?? '');
  if (!filePath) {
    return { output: 'ReadMsg: file_path is required', isError: true };
  }

  let resolved: string;
  try {
    resolved = resolveAndGuard(ctx.workspacePath, filePath);
  } catch (e) {
    return { output: `ReadMsg: ${describeError(e)}`, isError: true };
  }

  // Enforce a size cap before loading the whole file into memory (matches the
  // other office tools, which each guard against oversized inputs).
  const maxMb = resolveMsgMaxSizeMb(ctx);
  try {
    const sizeMb = fs.statSync(resolved).size / 1024 / 1024;
    if (sizeMb > maxMb) {
      return {
        output: `ReadMsg: file size ${sizeMb.toFixed(1)}MB exceeds limit of ${maxMb}MB`,
        isError: true,
      };
    }
  } catch (e) {
    return { output: `ReadMsg: cannot stat file: ${describeError(e)}`, isError: true };
  }

  let buffer: Buffer;
  try {
    buffer = fs.readFileSync(resolved);
  } catch (e) {
    return { output: `ReadMsg: cannot read file: ${describeError(e)}`, isError: true };
  }

  // .msg is an OLE2 / CFBF compound file. Validate the magic header up front:
  // MsgReader silently returns an empty result for non-CFBF data instead of throwing.
  const CFBF_MAGIC = Buffer.from([0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1]);
  if (buffer.length < 8 || !buffer.subarray(0, 8).equals(CFBF_MAGIC)) {
    return {
      output: `ReadMsg: not a valid Outlook .msg file (bad signature): ${path.basename(resolved)}`,
      isError: true,
    };
  }

  let reader: MsgReaderInstance;
  let fields: FieldsLike;
  try {
    // Copy into a standalone ArrayBuffer (MsgReader rejects Node Buffers).
    const arrayBuffer = new Uint8Array(buffer).buffer;
    reader = new MsgReader(arrayBuffer);
    fields = reader.getFileData() as unknown as FieldsLike;
  } catch (e) {
    return {
      output: `ReadMsg: failed to parse .msg (is this a valid Outlook message?): ${describeError(e)}`,
      isError: true,
    };
  }
  if (!isParsedMsgValid(fields)) {
    return {
      output: `ReadMsg: not a parseable Outlook message${fields.error ? ` (${fields.error})` : ''}: ${path.basename(resolved)}`,
      isError: true,
    };
  }

  // Recipients without an explicit type are treated as "To".
  const recipients = fields.recipients ?? [];
  const to = recipients
    .filter((r) => (r.recipType ?? 'to') === 'to')
    .map((r) => ({ name: r.name, email: pickEmail(r.smtpAddress, r.email) }));
  const cc = recipients
    .filter((r) => r.recipType === 'cc')
    .map((r) => ({ name: r.name, email: pickEmail(r.smtpAddress, r.email) }));

  const inputDir = path.join(ctx.workspacePath, 'input');
  const attachments: MsgAttachmentMeta[] = [];
  const rawAttachments = fields.attachments ?? [];
  const usedNames = new Set<string>();
  rawAttachments.forEach((att, i) => {
    const rawName = att.fileName || att.fileNameShort || '';
    const baseName = sanitizeAttachmentName(rawName, i);
    // Read-only movements (verify etc.) must not mutate the workspace.
    if (!ctx.editAllowed) {
      attachments.push({
        fileName: baseName,
        contentLength: att.contentLength,
        skipped: 'read-only phase (attachment not saved)',
      });
      return;
    }
    if (att.innerMsgContent) {
      attachments.push({
        fileName: baseName,
        contentLength: att.contentLength,
        skipped: 'embedded message (open separately)',
      });
      return;
    }
    // Resolve a name that collides with neither an earlier attachment this run
    // nor a file already present in input/ (user uploads, prior extractions).
    const name = resolveFreeName(inputDir, baseName, usedNames);
    usedNames.add(name);
    try {
      const data = reader.getAttachment(att as never);
      fs.mkdirSync(inputDir, { recursive: true });
      const dest = path.join(inputDir, name);
      // 'wx' = create exclusively: if something appeared at `dest` after the
      // free-name check (or `dest` is a dangling symlink), fail instead of
      // overwriting it or writing through the link.
      fs.writeFileSync(dest, Buffer.from(data.content), { flag: 'wx' });
      attachments.push({
        fileName: name,
        contentLength: data.content.length,
        savedPath: path.join('input', name),
      });
    } catch (e) {
      const reason = describeError(e);
      logger.warn(`[ReadMsg] failed to save attachment ${name}: ${reason}`);
      attachments.push({
        fileName: name,
        contentLength: att.contentLength,
        skipped: `extraction failed: ${reason}`,
      });
    }
  });

  // Prefer the SMTP address over the (possibly legacy-DN) sender email.
  const senderEmail = pickEmail(fields.senderSmtpAddress, fields.senderEmail);
  const view: MsgView = {
    subject: fields.subject,
    from: fields.senderName || senderEmail ? { name: fields.senderName, email: senderEmail } : undefined,
    to,
    cc,
    date: fields.messageDeliveryTime || fields.clientSubmitTime,
    body: selectMsgBody(fields),
    attachments,
  };
  const output = assembleMsgOutput(view, getToolOutputBudgetTokens(ctx), path.basename(resolved));
  logger.info(
    `[ReadMsg] ${path.basename(resolved)}: attachments=${attachments.length} bodyFormat=${view.body.format}`,
  );
  return { output, isError: false };
}