maestro/src/engine/tools/speech.ts
2026-06-03 05:08:00 +00:00

162 lines
5.2 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import * as fs from 'fs';
import * as path from 'path';
import { ToolDef } from '../../llm/openai-compat.js';
import type { ToolContext, ToolResult } from './core.js';
import { resolveAndGuard } from './core.js';
import { logger } from '../../logger.js';
/** Lower-case audio file extensions (without the leading dot) that the tool accepts. */
const SUPPORTED_EXTENSIONS: ReadonlySet<string> = new Set<string>(['mp3', 'wav']);
/**
 * Function-calling schema for the `TranscribeAudio` tool.
 *
 * The description strings are user/LLM-facing and intentionally Japanese.
 * Parenthesised remarks use ASCII parentheses and are kept balanced so the
 * "default when omitted" notes read as asides rather than run-on text.
 */
const TRANSCRIBE_AUDIO_DEF: ToolDef = {
  type: 'function',
  function: {
    name: 'TranscribeAudio',
    description: '音声ファイル(mp3/wav)を文字起こしする(話者分離対応、外部音声認識サーバーへ送信)。詳細は ReadToolDoc({ name: "TranscribeAudio" })。',
    parameters: {
      type: 'object',
      properties: {
        file_path: { type: 'string', description: 'workspace 内の音声ファイルパス(mp3, wav)' },
        language: { type: 'string', description: '言語コード(省略時: config の speech_language or "ja")' },
        diarize: { type: 'boolean', description: '話者分離を有効にする(省略時: true)' },
        prompt: { type: 'string', description: '文字起こしヒント(固有名詞・専門用語等)' },
      },
      required: ['file_path'],
    },
  },
};

/** Tool definitions exported by this module, keyed by tool name. */
export const TOOL_DEFS: Record<string, ToolDef> = {
  TranscribeAudio: TRANSCRIBE_AUDIO_DEF,
};
/** One recognized span of speech in the transcription response. */
interface TranscriptionSegment {
  /** Recognized text of this segment. */
  text: string;
  /** Speaker label, present only when the server attached one. */
  speaker?: string;
}

/** The fields of the transcription response body that this module reads. */
interface TranscriptionResponse {
  /** Whole transcript as a single string; used when no segments are present. */
  text?: string;
  /** Per-segment breakdown of the transcript. */
  segments?: TranscriptionSegment[];
}

/**
 * Renders a transcription response as plain text.
 *
 * - No segments: returns `response.text` (or `''` when absent).
 * - `diarize` false: returns the segment texts concatenated in order.
 * - `diarize` true: merges consecutive segments of the same speaker and emits
 *   one `[speaker] text` line per speaker turn, joined with `\n`. Segments
 *   without a speaker label are attributed to `Unknown`.
 *
 * The response is unvalidated JSON, so a segment without a string `text` is
 * treated as empty rather than leaking the literal "undefined" into the output.
 *
 * @param response Parsed response body.
 * @param diarize Whether to produce speaker-labelled lines.
 * @returns The formatted transcript; `''` when there is nothing to show.
 */
function formatTranscription(response: TranscriptionResponse, diarize: boolean): string {
  const segments = response.segments ?? [];
  if (segments.length === 0) {
    return response.text ?? '';
  }

  // Runtime guard: the declared type says `string`, the wire format may not.
  const textOf = (seg: TranscriptionSegment): string =>
    typeof seg.text === 'string' ? seg.text : '';

  if (!diarize) {
    return segments.map(textOf).join('');
  }

  const lines: string[] = [];
  let currentSpeaker: string | undefined;
  let currentText = '';

  // Emits the turn accumulated so far. The speaker is compared against
  // `undefined` (not tested for truthiness) so a turn labelled `""` or `0`
  // is not silently dropped; whitespace-only turns are skipped so no dangling
  // "[speaker] " line is produced.
  const flushTurn = (): void => {
    const trimmed = currentText.trim();
    if (currentSpeaker !== undefined && trimmed) {
      lines.push(`[${currentSpeaker}] ${trimmed}`);
    }
  };

  for (const seg of segments) {
    const speaker = seg.speaker ?? 'Unknown';
    if (speaker !== currentSpeaker) {
      flushTurn();
      currentSpeaker = speaker;
      currentText = textOf(seg);
    } else {
      currentText += textOf(seg);
    }
  }
  flushTurn();

  return lines.join('\n');
}
/**
 * Executes the `TranscribeAudio` tool.
 *
 * Validates the request, reads the audio file from the workspace, POSTs it as
 * multipart form data to `<speechServerUrl>/audio/transcriptions`, and returns
 * the formatted transcript. Failures detected by this function are reported
 * as a `ToolResult` with `isError: true` (messages are user-facing Japanese).
 *
 * @param name Tool name being dispatched; anything other than
 *   `TranscribeAudio` is not handled here.
 * @param input Raw tool arguments: `file_path` (required), `language`,
 *   `diarize`, `prompt`.
 * @param ctx Tool context; this function reads `workspacePath` and
 *   `toolsConfig` (`speechServerUrl`, `speechLanguage`, `speechTimeout`).
 * @returns `null` when `name` is not this tool, otherwise the tool result.
 */
export async function executeTool(
  name: string,
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult | null> {
  if (name !== 'TranscribeAudio') return null;

  // Arguments come from an LLM, so check the runtime type instead of casting.
  const filePath = typeof input.file_path === 'string' ? input.file_path : undefined;
  if (!filePath) {
    return { output: 'file_path は必須です', isError: true };
  }

  const serverUrl = ctx.toolsConfig?.speechServerUrl;
  if (!serverUrl) {
    return { output: 'speech_server_url が config.yaml に未設定です', isError: true };
  }

  // NOTE(review): presumably resolveAndGuard rejects paths outside the
  // workspace; how it signals that is not visible here — confirm.
  const resolved = resolveAndGuard(ctx.workspacePath, filePath);
  const ext = path.extname(resolved).toLowerCase().replace('.', '');
  if (!SUPPORTED_EXTENSIONS.has(ext)) {
    return { output: `対応フォーマット: mp3, wav(指定: .${ext})`, isError: true };
  }
  if (!fs.existsSync(resolved)) {
    return { output: `ファイルが見つかりません: ${filePath}`, isError: true };
  }

  // An empty or non-string language falls through to the configured default.
  const requestedLanguage =
    typeof input.language === 'string' && input.language !== '' ? input.language : undefined;
  const language = requestedLanguage ?? ctx.toolsConfig?.speechLanguage ?? 'ja';
  const diarize = input.diarize !== false;
  const prompt = typeof input.prompt === 'string' ? input.prompt : undefined;
  const timeout = (ctx.toolsConfig?.speechTimeout ?? 300) * 1000;

  const fileName = path.basename(resolved);
  const mimeType = ext === 'mp3' ? 'audio/mpeg' : 'audio/wav';

  // Read the file in its own step so an I/O failure (directory, permissions,
  // file removed after the existence check) is not reported as a server
  // connection problem. The async read avoids blocking the event loop.
  let audio: Blob;
  try {
    audio = new Blob([await fs.promises.readFile(resolved)], { type: mimeType });
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : String(err);
    return { output: `ファイルを読み込めません: ${filePath} (${msg})`, isError: true };
  }

  const formData = new FormData();
  formData.append('file', audio, fileName);
  formData.append('language', language);
  formData.append('response_format', 'verbose_json');
  if (prompt) {
    formData.append('prompt', prompt);
  }

  const headers: Record<string, string> = {};
  if (diarize) {
    headers['X-Diarize'] = 'true';
  }

  const url = `${serverUrl.replace(/\/+$/, '')}/audio/transcriptions`;
  const controller = new AbortController();
  // The timer stays armed until the body has been consumed, so a stalled
  // response body is aborted too; `finally` clears it on every exit path.
  const timer = setTimeout(() => controller.abort(), timeout);
  try {
    const response = await fetch(url, {
      method: 'POST',
      body: formData,
      headers,
      signal: controller.signal,
    });

    if (!response.ok) {
      const errText = await response.text().catch(() => '');
      return {
        output: `音声認識サーバーエラー (${response.status}): ${errText}`.slice(0, 2000),
        isError: true,
      };
    }

    // A literal JSON `null` body is treated as an empty response.
    const data = (await response.json()) as TranscriptionResponse | null;
    const text = formatTranscription(data ?? {}, diarize);
    if (!text) {
      return { output: '文字起こし結果が空です', isError: true };
    }

    const header = `## 文字起こし結果: ${fileName}\n\n`;
    return { output: header + text, isError: false };
  } catch (err: unknown) {
    if (err instanceof Error && err.name === 'AbortError') {
      return { output: `タイムアウト(${timeout / 1000}秒): ${filePath}`, isError: true };
    }
    const msg = err instanceof Error ? err.message : String(err);
    if (err instanceof SyntaxError) {
      // The server answered 2xx but the body was not valid JSON.
      return { output: `音声認識サーバーの応答を解析できません: ${msg}`, isError: true };
    }
    return { output: `音声認識サーバーに接続できません: ${serverUrl} (${msg})`, isError: true };
  } finally {
    clearTimeout(timer);
  }
}