162 lines
5.2 KiB
TypeScript
162 lines
5.2 KiB
TypeScript
import * as fs from 'fs';
|
||
import * as path from 'path';
|
||
import { ToolDef } from '../../llm/openai-compat.js';
|
||
import type { ToolContext, ToolResult } from './core.js';
|
||
import { resolveAndGuard } from './core.js';
|
||
import { logger } from '../../logger.js';
|
||
|
||
/** Lower-case audio file extensions (without the leading dot) accepted by TranscribeAudio. */
const SUPPORTED_EXTENSIONS = new Set<string>(['mp3', 'wav']);
|
||
|
||
/**
 * Function-calling schema for the `TranscribeAudio` tool.
 *
 * Only `file_path` is required; `language`, `diarize` and `prompt` are
 * optional and their fallbacks are applied by `executeTool`.
 * The description strings are sent to the model verbatim and are therefore
 * kept in Japanese.
 */
const TRANSCRIBE_AUDIO_DEF: ToolDef = {
  type: 'function',
  function: {
    name: 'TranscribeAudio',
    description: '音声ファイル(mp3/wav)を文字起こしする(話者分離対応、外部音声認識サーバーへ送信)。詳細は ReadToolDoc({ name: "TranscribeAudio" })。',
    parameters: {
      type: 'object',
      properties: {
        file_path: { type: 'string', description: 'workspace 内の音声ファイルパス(mp3, wav)' },
        language: { type: 'string', description: '言語コード(省略時: config の speech_language or "ja")' },
        diarize: { type: 'boolean', description: '話者分離を有効にする(省略時: true)' },
        prompt: { type: 'string', description: '文字起こしヒント(固有名詞・専門用語等)' },
      },
      required: ['file_path'],
    },
  },
};
|
||
|
||
/** Tool definitions exported by this module, keyed by tool name. */
export const TOOL_DEFS: Record<string, ToolDef> = {
  TranscribeAudio: TRANSCRIBE_AUDIO_DEF,
};
|
||
|
||
/** One entry of the `segments` array in the transcription server's JSON response. */
interface TranscriptionSegment {
  /** Recognized text of this segment. */
  text: string;
  /** Speaker label; segments without one are rendered as `Unknown` in diarized output. */
  speaker?: string;
}

/** JSON body returned by the transcription endpoint, as far as this module reads it. */
interface TranscriptionResponse {
  /** Whole-file transcript; used only when `segments` is missing or empty. */
  text?: string;
  /** Per-segment results; preferred over `text` whenever non-empty. */
  segments?: TranscriptionSegment[];
}

/**
 * Render a transcription response as plain text.
 *
 * - No segments: returns `response.text` (or `''`).
 * - Segments, `diarize` false: returns all segment texts concatenated as-is.
 * - Segments, `diarize` true: merges consecutive segments of the same speaker
 *   into one turn and returns one `[speaker] text` line per turn, separated
 *   by newlines. Turn text is trimmed; turns that are empty after trimming
 *   are omitted.
 *
 * @param response Parsed server response. It comes from unvalidated JSON, so
 *   a segment whose `text` is missing at runtime is treated as empty text.
 * @param diarize Whether speaker-labelled output was requested.
 * @returns The formatted transcript; `''` when nothing was recognized.
 */
function formatTranscription(response: TranscriptionResponse, diarize: boolean): string {
  const segments = response.segments ?? [];
  if (segments.length === 0) {
    return response.text ?? '';
  }
  if (!diarize) {
    // Segment texts carry their own spacing, so they are joined without a separator.
    return segments.map((seg) => seg.text ?? '').join('');
  }

  const lines: string[] = [];
  let currentSpeaker: string | undefined;
  let currentText = '';

  // Emit the turn accumulated so far, skipping turns with no visible text.
  const flushTurn = (): void => {
    const trimmed = currentText.trim();
    if (currentSpeaker !== undefined && trimmed) {
      lines.push(`[${currentSpeaker}] ${trimmed}`);
    }
  };

  for (const seg of segments) {
    const speaker = seg.speaker ?? 'Unknown';
    // Guard against a missing `text` so the literal "undefined" never leaks into the output.
    const text = seg.text ?? '';
    if (speaker !== currentSpeaker) {
      flushTurn();
      currentSpeaker = speaker;
      currentText = text;
    } else {
      currentText += text;
    }
  }
  flushTurn();

  return lines.join('\n');
}
|
||
|
||
/**
 * Execute the `TranscribeAudio` tool: upload an mp3/wav file from the
 * workspace to the configured speech server and return the transcript.
 *
 * The file is POSTed as multipart form data to `<speechServerUrl>/audio/transcriptions`
 * with `response_format=verbose_json`; speaker diarization is requested via
 * the `X-Diarize: true` header. The whole exchange, including reading the
 * response body, is bounded by `speechTimeout` seconds (default 300).
 *
 * @param name Tool name; anything other than `TranscribeAudio` is not handled here.
 * @param input Raw tool arguments: `file_path` (required), `language`,
 *   `diarize` (enabled unless explicitly `false`) and `prompt`.
 * @param ctx Tool context supplying `workspacePath` and `toolsConfig`.
 * @returns `null` when `name` is not this tool; otherwise a result whose
 *   `isError` flag is set for validation, file, server and timeout failures.
 *   Note that `resolveAndGuard` runs outside any try/catch, so whatever it
 *   throws propagates to the caller.
 */
export async function executeTool(
  name: string,
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult | null> {
  if (name !== 'TranscribeAudio') return null;

  // Arguments arrive as untyped JSON, so check the runtime type instead of casting.
  const filePath = input.file_path;
  if (typeof filePath !== 'string' || !filePath) {
    return { output: 'file_path は必須です', isError: true };
  }

  const serverUrl = ctx.toolsConfig?.speechServerUrl;
  if (!serverUrl) {
    return { output: 'speech_server_url が config.yaml に未設定です', isError: true };
  }

  const resolved = resolveAndGuard(ctx.workspacePath, filePath);
  const ext = path.extname(resolved).toLowerCase().replace('.', '');
  if (!SUPPORTED_EXTENSIONS.has(ext)) {
    return { output: `対応フォーマット: mp3, wav(指定: .${ext})`, isError: true };
  }

  // Read asynchronously so a large audio file does not block the event loop,
  // and report file problems separately from server problems.
  let fileBuffer: Buffer;
  try {
    fileBuffer = await fs.promises.readFile(resolved);
  } catch (err: unknown) {
    const code = err instanceof Error && 'code' in err ? err.code : undefined;
    if (code === 'ENOENT') {
      return { output: `ファイルが見つかりません: ${filePath}`, isError: true };
    }
    const msg = err instanceof Error ? err.message : String(err);
    return { output: `ファイルを読み込めません: ${filePath} (${msg})`, isError: true };
  }

  // Precedence: explicit non-empty argument, then configured language, then "ja".
  const language =
    typeof input.language === 'string' && input.language
      ? input.language
      : (ctx.toolsConfig?.speechLanguage ?? 'ja');
  const diarize = input.diarize !== false;
  const prompt = typeof input.prompt === 'string' ? input.prompt : undefined;
  const timeout = (ctx.toolsConfig?.speechTimeout ?? 300) * 1000;

  const fileName = path.basename(resolved);
  const mimeType = ext === 'mp3' ? 'audio/mpeg' : 'audio/wav';

  const formData = new FormData();
  formData.append('file', new Blob([fileBuffer], { type: mimeType }), fileName);
  formData.append('language', language);
  formData.append('response_format', 'verbose_json');
  if (prompt) {
    formData.append('prompt', prompt);
  }

  const headers: Record<string, string> = {};
  if (diarize) {
    headers['X-Diarize'] = 'true';
  }

  // Strip trailing slashes so the joined URL never contains "//audio".
  const url = `${serverUrl.replace(/\/+$/, '')}/audio/transcriptions`;

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeout);

  try {
    const response = await fetch(url, {
      method: 'POST',
      body: formData,
      headers,
      signal: controller.signal,
    });

    if (!response.ok) {
      const errText = await response.text().catch(() => '');
      return {
        output: `音声認識サーバーエラー (${response.status}): ${errText}`.slice(0, 2000),
        isError: true,
      };
    }

    const data = (await response.json()) as TranscriptionResponse;
    const text = formatTranscription(data, diarize);
    if (!text) {
      return { output: '文字起こし結果が空です', isError: true };
    }

    return { output: `## 文字起こし結果: ${fileName}\n\n` + text, isError: false };
  } catch (err: unknown) {
    // The only thing that aborts this signal is the timeout timer above.
    if (controller.signal.aborted || (err instanceof Error && err.name === 'AbortError')) {
      return { output: `タイムアウト(${timeout / 1000}秒): ${filePath}`, isError: true };
    }
    const msg = err instanceof Error ? err.message : String(err);
    if (err instanceof SyntaxError) {
      // The server answered, but the body was not valid JSON.
      return { output: `音声認識サーバーの応答を解析できません: ${msg}`, isError: true };
    }
    return { output: `音声認識サーバーに接続できません: ${serverUrl} (${msg})`, isError: true };
  } finally {
    // Always release the timer, on success, error response and thrown failure alike.
    clearTimeout(timer);
  }
}
|