// 417 lines · 14 KiB · TypeScript
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import MsgReaderImport from '@kenjiuno/msgreader';
|
|
import { ToolDef } from '../../llm/openai-compat.js';
|
|
import type { ToolContext, ToolResult } from './core.js';
|
|
import { resolveAndGuard, truncateToBudget, getToolOutputBudgetTokens } from './core.js';
|
|
import { logger } from '../../logger.js';
|
|
|
|
// CJS/ESM interop: under native Node ESM (the built dist), a default import of
// this CommonJS package resolves to the module.exports namespace object, not the
// class — so `new MsgReaderImport()` throws "is not a constructor". Vitest/tsx
// hide this via __esModule interop. Pick the real constructor for both worlds.
const MsgReader = (
  typeof MsgReaderImport === 'function'
    ? MsgReaderImport
    : (MsgReaderImport as unknown as { default: typeof MsgReaderImport }).default
) as typeof MsgReaderImport;

/** Instance type produced by the resolved `MsgReader` constructor. */
type MsgReaderInstance = InstanceType<typeof MsgReader>;
|
|
|
|
// Fallback size cap in MB for input .msg files, used by executeReadMsg when
// ctx.toolsConfig.officeMsgMaxSizeMb is not a positive finite number.
const DEFAULT_MSG_MAX_SIZE_MB = 25;
|
|
|
|
/**
 * A sender or recipient of a message.
 * Both parts are optional; `formatAddress` renders whichever are present.
 */
export interface MsgAddress {
  /** Display name, if any. */
  name?: string;
  /** Address string, if any. */
  email?: string;
}
|
|
|
|
/** Metadata reported for one attachment of a parsed message. */
export interface MsgAttachmentMeta {
  /** Sanitized file name shown in the output. */
  fileName: string;
  /** Size in bytes, when known. */
  contentLength?: number;
  /** Relative path the attachment was written to (when saved). */
  savedPath?: string;
  /** Reason the attachment was not saved (mutually exclusive with savedPath). */
  skipped?: string;
}
|
|
|
|
/** Normalized view of a parsed message, as consumed by `formatMsgOutput`. */
export interface MsgView {
  /** Subject line, if present. */
  subject?: string;
  /** Sender, if a name or address could be determined. */
  from?: MsgAddress;
  /** Primary ("to") recipients; empty when there are none. */
  to: MsgAddress[];
  /** Carbon-copy recipients; empty when there are none. */
  cc: MsgAddress[];
  /** Date string for the message, if present. */
  date?: string;
  /** Body text plus the source it came from (`'none'` when no text body exists). */
  body: { text: string; format: 'plain' | 'html' | 'none' };
  /** One entry per attachment, saved or skipped. */
  attachments: MsgAttachmentMeta[];
}
|
|
|
|
/**
 * Render a single address as `Name <email>`, falling back gracefully.
 *
 * @param a - Address whose `name` and/or `email` may be missing or blank.
 * @returns `Name <email>` when both parts are present, the single available
 *   part when only one is, or `(unknown)` when neither is.
 */
export function formatAddress(a: MsgAddress): string {
  const name = a.name?.trim();
  const email = a.email?.trim();
  if (name && email) {
    // When the display name is just the address itself, print it once
    // instead of `x@y <x@y>`.
    return name.toLowerCase() === email.toLowerCase() ? email : `${name} <${email}>`;
  }
  return name || email || '(unknown)';
}
|
|
|
|
/**
 * Named character references understood by `decodeEntities`, keyed by the
 * entity name without the surrounding `&` and `;`. (`&#39;` is handled by the
 * numeric branch, so it needs no entry here.)
 */
const NAMED_ENTITIES: Record<string, string> = {
  nbsp: ' ',
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
};

/**
 * Matches one supported character reference. Exactly one capture group is set:
 * 1 = decimal digits, 2 = hex digits, 3 = a name present in NAMED_ENTITIES.
 */
const ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(nbsp|amp|lt|gt|quot|apos));/g;

// Decode a numeric character reference, preserving the original entity if the
// code point is out of range (broken email HTML must not crash the whole read).
function safeFromCodePoint(code: number, original: string): string {
  const isSurrogate = code >= 0xd800 && code <= 0xdfff;
  if (!Number.isFinite(code) || code < 0 || code > 0x10ffff || isSurrogate) {
    return original;
  }
  try {
    return String.fromCodePoint(code);
  } catch {
    return original;
  }
}

/**
 * Decode the supported named, decimal and hexadecimal character references.
 *
 * Everything is decoded in a single pass so that the output of one
 * replacement is never re-scanned: `&amp;#65;` becomes the literal text
 * `&#65;`, not `A`. Unknown or malformed references are left untouched.
 */
function decodeEntities(s: string): string {
  return s.replace(ENTITY_PATTERN, (match: string, dec?: string, hex?: string, name?: string) => {
    if (dec !== undefined) return safeFromCodePoint(Number(dec), match);
    if (hex !== undefined) return safeFromCodePoint(parseInt(hex, 16), match);
    return (name !== undefined ? NAMED_ENTITIES[name] : undefined) ?? match;
  });
}
|
|
|
|
/**
 * Convert an HTML fragment into readable plain text.
 *
 * Steps, in order: drop `<script>`/`<style>` blocks with their contents, drop
 * HTML comments (including conditional comments), turn `<br>` and closing
 * block-level tags into newlines, remove every remaining tag, decode character
 * references, then normalize whitespace.
 *
 * @param html - HTML markup; need not be well-formed.
 * @returns The extracted text, trimmed, with at most one blank line in a row.
 */
export function stripHtml(html: string): string {
  const withoutMarkup = html
    // Drop script/style blocks including their contents (closing tag may carry whitespace).
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '')
    // Drop comments as a unit; otherwise a `>` inside one ends the "tag" early
    // and the remainder of the comment leaks into the text.
    .replace(/<!--[\s\S]*?-->/g, '')
    // Treat <br> (with or without attributes) and block-level boundaries as newlines.
    .replace(/<br\b[^>]*>/gi, '\n')
    .replace(/<\/(p|div|li|tr|h[1-6]|ul|ol|table|blockquote|section|article)\s*>/gi, '\n')
    // Remove all remaining tags.
    .replace(/<[^>]+>/g, '');

  // Decode only after tags are gone so that `&lt;b&gt;` survives as literal text.
  const decoded = decodeEntities(withoutMarkup);

  // Normalize whitespace: collapse intra-line runs, trim each line, collapse blank runs.
  return decoded
    .split('\n')
    .map((line) => line.replace(/[^\S\n]+/g, ' ').trim())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
|
|
|
/**
 * Choose the best available body text, preferring plain over HTML.
 *
 * @param fields - Candidate body sources: `body` (plain text), `bodyHtml`
 *   (HTML string) and `html` (HTML as a string or raw bytes, decoded as UTF-8).
 * @returns The trimmed plain body when non-empty; otherwise the first HTML
 *   source that still has text after `stripHtml`; otherwise an empty text with
 *   format `'none'`.
 */
export function selectMsgBody(fields: {
  body?: string;
  bodyHtml?: string;
  // PidTagHtml: some HTML-only messages carry the body here as raw bytes.
  html?: Uint8Array | string;
}): {
  text: string;
  format: 'plain' | 'html' | 'none';
} {
  const plain = fields.body?.trim();
  if (plain) return { text: plain, format: 'plain' };

  // Try each HTML source in order; an empty/whitespace bodyHtml must not block
  // the PidTagHtml fallback, so we check the stripped result of each. Sources
  // are thunks so the byte payload is only decoded when it is actually needed.
  const htmlSources: Array<() => string | undefined> = [
    () => fields.bodyHtml,
    () => {
      const raw = fields.html;
      if (raw == null) return undefined;
      return typeof raw === 'string' ? raw : Buffer.from(raw).toString('utf8');
    },
  ];
  for (const load of htmlSources) {
    const source = load();
    if (!source) continue;
    const stripped = stripHtml(source);
    if (stripped) return { text: stripped, format: 'html' };
  }
  return { text: '', format: 'none' };
}
|
|
|
|
/**
 * Pick the most usable email address from candidates, preferring a real SMTP
 * address (contains '@') over an Exchange legacy EX DN (`/O=.../CN=...`).
 *
 * @param candidates - Possible addresses in priority order; blank and
 *   undefined entries are ignored.
 * @returns The first trimmed candidate containing `@`, else the first
 *   non-blank candidate, else `undefined`.
 */
export function pickEmail(...candidates: (string | undefined)[]): string | undefined {
  const usable: string[] = [];
  for (const candidate of candidates) {
    const trimmed = candidate?.trim();
    if (trimmed) usable.push(trimmed);
  }
  return usable.find((value) => value.includes('@')) ?? usable[0];
}
|
|
|
|
/**
 * Reduce an attachment name to a safe basename, never escaping the target dir.
 *
 * @param name - File name as stored in the message (untrusted).
 * @param index - Zero-based position of the attachment, used for the fallback name.
 * @returns The last path segment with control characters removed and
 *   Windows-reserved characters replaced by `_`, or `attachment-<index + 1>`
 *   when nothing usable remains.
 */
export function sanitizeAttachmentName(name: string, index: number): string {
  // Take the last path segment across both separators (defends path traversal).
  const base = name.split(/[/\\]/).pop() ?? '';
  const cleaned = base
    // Strip control characters and null bytes.
    // eslint-disable-next-line no-control-regex
    .replace(/[\x00-\x1f\x7f]/g, '')
    // Characters Windows forbids in file names (`:` would also address an
    // alternate data stream); replace rather than drop to keep names distinct.
    .replace(/[<>:"|?*]/g, '_')
    .trim();
  // Reject names that are empty or consist only of dots/spaces.
  if (!cleaned || /^[.\s]*$/.test(cleaned)) {
    return `attachment-${index + 1}`;
  }
  return cleaned;
}
|
|
|
|
/**
 * Find a filename that collides with neither this run nor existing files on disk.
 *
 * @param dir - Directory the file will be written to.
 * @param name - Preferred file name.
 * @param used - Names already handed out during this run.
 * @returns `name` itself when free, otherwise `stem-1.ext`, `stem-2.ext`, …
 *   using the first suffix that is neither in `used` nor present in `dir`.
 */
function resolveFreeName(dir: string, name: string, used: Set<string>): string {
  const ext = path.extname(name);
  const stem = name.slice(0, name.length - ext.length);
  const isTaken = (candidate: string): boolean =>
    used.has(candidate) || fs.existsSync(path.join(dir, candidate));

  let candidate = name;
  for (let suffix = 1; isTaken(candidate); suffix += 1) {
    candidate = `${stem}-${suffix}${ext}`;
  }
  return candidate;
}
|
|
|
|
/**
 * Build the human-readable text output for a parsed message.
 *
 * The result has up to three sections separated by a blank line: the header
 * lines that are present (Subject/From/To/Cc/Date), the body (or
 * `(no text body)` when the format is `'none'`), and the attachment list.
 *
 * @param view - Normalized message to render.
 * @returns The rendered text. A message without any header field starts
 *   directly with the body instead of an empty header section.
 */
export function formatMsgOutput(view: MsgView): string {
  const headers: string[] = [];
  if (view.subject) headers.push(`Subject: ${view.subject}`);
  if (view.from) headers.push(`From: ${formatAddress(view.from)}`);
  if (view.to.length) headers.push(`To: ${view.to.map((a) => formatAddress(a)).join(', ')}`);
  if (view.cc.length) headers.push(`Cc: ${view.cc.map((a) => formatAddress(a)).join(', ')}`);
  if (view.date) headers.push(`Date: ${view.date}`);

  const parts: string[] = [];
  // Skip the header section entirely when it is empty; otherwise the output
  // would begin with two stray newlines.
  if (headers.length) parts.push(headers.join('\n'));

  parts.push(view.body.format === 'none' ? '(no text body)' : view.body.text);

  if (view.attachments.length) {
    const attLines = view.attachments.map((att) => {
      if (att.savedPath) {
        const size = att.contentLength != null ? ` (${att.contentLength} bytes)` : '';
        return `- ${att.fileName}${size} -> ${att.savedPath}`;
      }
      return `- ${att.fileName} - skipped: ${att.skipped ?? 'not saved'}`;
    });
    parts.push([`Attachments (${view.attachments.length}):`, ...attLines].join('\n'));
  }

  return parts.join('\n\n');
}
|
|
|
|
/**
 * Assemble the final output, truncating ONLY the body to the token budget.
 * Headers and the attachment list (with saved input/ paths) always survive —
 * attachments are already written to disk and the caller needs their paths.
 *
 * @param view - Normalized message to render.
 * @param budgetTokens - Total token budget for the tool output.
 * @param sourceLabel - Label forwarded to `truncateToBudget`.
 * @returns The rendered message with its body cut to the remaining budget.
 */
export function assembleMsgOutput(view: MsgView, budgetTokens: number, sourceLabel: string): string {
  // Render everything except the body to measure how much room it takes.
  const shell = formatMsgOutput({ ...view, body: { text: '', format: 'plain' } });
  // Reserve roughly shell.length / 4 tokens plus a margin of 64.
  const reserveTokens = Math.ceil(shell.length / 4) + 64;
  // The body always gets at least 500 tokens, even when the shell is large.
  const bodyBudget = Math.max(500, budgetTokens - reserveTokens);

  const bodyText = view.body.format === 'none' ? '' : view.body.text;
  const { text: truncatedBody } = truncateToBudget(bodyText, bodyBudget, { sourceLabel });

  return formatMsgOutput({ ...view, body: { text: truncatedBody, format: view.body.format } });
}
|
|
|
|
/**
 * msgreader's getFileData() returns `{ error: 'Unsupported file type!' }` (not a
 * throw) for CFBF files that aren't Outlook messages (legacy .doc/.xls, broken
 * compound files). Treat anything whose root isn't a 'msg' as a read failure.
 *
 * @param fields - Parse result; only `error` and `dataType` are inspected.
 * @returns `true` only when `error` is empty/absent and `dataType` is `'msg'`.
 */
export function isParsedMsgValid(fields: { error?: string; dataType?: string | null }): boolean {
  if (fields.error) return false;
  return fields.dataType === 'msg';
}
|
|
|
|
/**
 * Tool definition for `ReadMsg`: a function tool taking a single required
 * `file_path` string argument.
 */
export const READ_MSG_DEF: ToolDef = {
  type: 'function',
  function: {
    name: 'ReadMsg',
    description:
      'Read an Outlook .msg email file, extracting subject/sender/recipients/body and saving attachments to input/. 詳細は ReadToolDoc({ name: "ReadMsg" }) で取得可能。',
    parameters: {
      type: 'object',
      properties: {
        file_path: { type: 'string', description: 'Path to the .msg file' },
      },
      required: ['file_path'],
    },
  },
};
|
|
|
|
/**
 * The subset of the parser's `getFileData()` result that `executeReadMsg`
 * reads. Every field is optional because the parse result is cast to this
 * shape rather than validated.
 */
interface FieldsLike {
  /** Set by the parser instead of throwing when the file is unsupported. */
  error?: string;
  /** Root node type; `'msg'` for a real Outlook message. */
  dataType?: string | null;
  subject?: string;
  senderName?: string;
  /** Sender address; may not be an SMTP address. */
  senderEmail?: string;
  /** Sender SMTP address, preferred over `senderEmail` when present. */
  senderSmtpAddress?: string;
  /** Plain-text body. */
  body?: string;
  /** HTML body as a string. */
  bodyHtml?: string;
  /** HTML body as raw bytes. */
  html?: Uint8Array;
  /** Preferred source for the displayed date. */
  messageDeliveryTime?: string;
  /** Fallback source for the displayed date. */
  clientSubmitTime?: string;
  /** Recipients; `recipType` of `'to'` (or missing) and `'cc'` are used. */
  recipients?: { name?: string; email?: string; smtpAddress?: string; recipType?: string }[];
  attachments?: {
    fileName?: string;
    /** Used when `fileName` is empty. */
    fileNameShort?: string;
    contentLength?: number;
    /** Truthy for an embedded message, which is not extracted. */
    innerMsgContent?: boolean;
    dataType?: string | null;
  }[];
}
|
|
|
|
/** Extract a printable message from any thrown value, not only `Error` instances. */
function describeError(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Execute the `ReadMsg` tool: parse an Outlook `.msg` file inside the
 * workspace, save its attachments under `input/`, and return a text rendering.
 *
 * Failures are reported through the result (`isError: true`) rather than
 * thrown: missing `file_path`, a path rejected by `resolveAndGuard`, a file
 * over the size limit, an unreadable file, a bad CFBF signature, or a parse
 * failure. A failure to extract an individual attachment is logged and
 * recorded on that attachment without failing the whole read.
 *
 * @param input - Tool arguments; `file_path` is required.
 * @param ctx - Tool context supplying the workspace path, optional
 *   `toolsConfig.officeMsgMaxSizeMb`, and the `editAllowed` flag.
 * @returns The rendered message, or an error result.
 */
export async function executeReadMsg(
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult> {
  const filePath = String(input.file_path ?? '');
  if (!filePath) {
    return { output: 'ReadMsg: file_path is required', isError: true };
  }

  let resolved: string;
  try {
    resolved = resolveAndGuard(ctx.workspacePath, filePath);
  } catch (e) {
    return { output: `ReadMsg: ${describeError(e)}`, isError: true };
  }

  // Enforce a size cap before loading the whole file into memory (matches the
  // other office tools, which each guard against oversized inputs).
  const configuredMb = ctx.toolsConfig?.officeMsgMaxSizeMb;
  const maxMb =
    typeof configuredMb === 'number' && Number.isFinite(configuredMb) && configuredMb > 0
      ? configuredMb
      : DEFAULT_MSG_MAX_SIZE_MB;
  try {
    const sizeMb = fs.statSync(resolved).size / 1024 / 1024;
    if (sizeMb > maxMb) {
      return {
        output: `ReadMsg: file size ${sizeMb.toFixed(1)}MB exceeds limit of ${maxMb}MB`,
        isError: true,
      };
    }
  } catch (e) {
    return { output: `ReadMsg: cannot stat file: ${describeError(e)}`, isError: true };
  }

  let buffer: Buffer;
  try {
    buffer = fs.readFileSync(resolved);
  } catch (e) {
    return { output: `ReadMsg: cannot read file: ${describeError(e)}`, isError: true };
  }

  // .msg is an OLE2 / CFBF compound file. Validate the magic header up front:
  // MsgReader silently returns an empty result for non-CFBF data instead of throwing.
  const CFBF_MAGIC = Buffer.from([0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1]);
  if (buffer.length < 8 || !buffer.subarray(0, 8).equals(CFBF_MAGIC)) {
    return {
      output: `ReadMsg: not a valid Outlook .msg file (bad signature): ${path.basename(resolved)}`,
      isError: true,
    };
  }

  let reader: MsgReaderInstance;
  let fields: FieldsLike;
  try {
    // Copy into a standalone ArrayBuffer (MsgReader rejects Node Buffers).
    const arrayBuffer = new Uint8Array(buffer).buffer;
    reader = new MsgReader(arrayBuffer);
    fields = reader.getFileData() as unknown as FieldsLike;
  } catch (e) {
    return {
      output: `ReadMsg: failed to parse .msg (is this a valid Outlook message?): ${describeError(e)}`,
      isError: true,
    };
  }

  if (!isParsedMsgValid(fields)) {
    return {
      output: `ReadMsg: not a parseable Outlook message${fields.error ? ` (${fields.error})` : ''}: ${path.basename(resolved)}`,
      isError: true,
    };
  }

  // Recipients without a recipType count as "to"; types other than to/cc are not shown.
  const recipients = fields.recipients ?? [];
  const toAddress = (r: { name?: string; email?: string; smtpAddress?: string }): MsgAddress => ({
    name: r.name,
    email: pickEmail(r.smtpAddress, r.email),
  });
  const to = recipients.filter((r) => (r.recipType ?? 'to') === 'to').map(toAddress);
  const cc = recipients.filter((r) => r.recipType === 'cc').map(toAddress);

  const inputDir = path.join(ctx.workspacePath, 'input');
  const attachments: MsgAttachmentMeta[] = [];
  const rawAttachments = fields.attachments ?? [];
  const usedNames = new Set<string>();

  for (const [index, att] of rawAttachments.entries()) {
    const rawName = att.fileName || att.fileNameShort || '';
    const baseName = sanitizeAttachmentName(rawName, index);

    // Read-only movements (verify etc.) must not mutate the workspace.
    if (!ctx.editAllowed) {
      attachments.push({
        fileName: baseName,
        contentLength: att.contentLength,
        skipped: 'read-only phase (attachment not saved)',
      });
      continue;
    }

    if (att.innerMsgContent) {
      attachments.push({
        fileName: baseName,
        contentLength: att.contentLength,
        skipped: 'embedded message (open separately)',
      });
      continue;
    }

    // Resolve a name that collides with neither an earlier attachment this run
    // nor a file already present in input/ (user uploads, prior extractions).
    const name = resolveFreeName(inputDir, baseName, usedNames);
    usedNames.add(name);

    try {
      const data = reader.getAttachment(att as never);
      fs.mkdirSync(inputDir, { recursive: true });
      const dest = path.join(inputDir, name);
      fs.writeFileSync(dest, Buffer.from(data.content));
      attachments.push({
        fileName: name,
        contentLength: data.content.length,
        savedPath: path.join('input', name),
      });
    } catch (e) {
      // One bad attachment must not fail the whole read; record why it was skipped.
      const reason = describeError(e);
      logger.warn(`[ReadMsg] failed to save attachment ${name}: ${reason}`);
      attachments.push({
        fileName: name,
        contentLength: att.contentLength,
        skipped: `extraction failed: ${reason}`,
      });
    }
  }

  const senderEmail = pickEmail(fields.senderSmtpAddress, fields.senderEmail);
  const view: MsgView = {
    subject: fields.subject,
    from: fields.senderName || senderEmail ? { name: fields.senderName, email: senderEmail } : undefined,
    to,
    cc,
    date: fields.messageDeliveryTime || fields.clientSubmitTime,
    body: selectMsgBody(fields),
    attachments,
  };

  const output = assembleMsgOutput(view, getToolOutputBudgetTokens(ctx), path.basename(resolved));
  logger.info(
    `[ReadMsg] ${path.basename(resolved)}: attachments=${attachments.length} bodyFormat=${view.body.format}`,
  );
  return { output, isError: false };
}
|