// maestro/src/engine/tools/speech.ts

import * as fs from 'fs';
import * as path from 'path';
import { ToolDef } from '../../llm/openai-compat.js';
import type { ToolContext, ToolResult } from './core.js';
import { resolveAndGuard } from './core.js';
import { logger } from '../../logger.js';

/**
 * Audio file extensions accepted by the TranscribeAudio tool.
 * Entries are lower-case and carry no leading dot, matching how the
 * extension is normalized before the lookup in `executeTool`.
 */
const SUPPORTED_EXTENSIONS = new Set(['mp3', 'wav']);

/**
 * Function-tool definition for `TranscribeAudio`.
 *
 * Declares four parameters, of which only `file_path` is required:
 * - `file_path`: path of an mp3/wav file inside the workspace.
 * - `language`: language code; when omitted the executor falls back to the
 *   configured speech language and then to "ja".
 * - `diarize`: speaker separation toggle; treated as enabled unless it is
 *   explicitly `false`.
 * - `prompt`: optional transcription hint (proper nouns, jargon, ...).
 *
 * The description strings are runtime data and are intentionally left in
 * Japanese.
 */
const TRANSCRIBE_AUDIO_DEF: ToolDef = {
  type: 'function',
  function: {
    name: 'TranscribeAudio',
    description: '音声ファイル（mp3/wav）を文字起こしする（話者分離対応、外部音声認識サーバーへ送信）。詳細は ReadToolDoc({ name: "TranscribeAudio" })。',
    parameters: {
      type: 'object',
      properties: {
        file_path: { type: 'string', description: 'workspace 内の音声ファイルパス（mp3, wav）' },
        language: { type: 'string', description: '言語コード（省略時: config の speech_language or "ja"）' },
        diarize: { type: 'boolean', description: '話者分離を有効にする（省略時: true）' },
        prompt: { type: 'string', description: '文字起こしヒント（固有名詞・専門用語等）' },
      },
      required: ['file_path'],
    },
  },
};

/** Tool definitions exported by this module, keyed by tool name. */
export const TOOL_DEFS: Record<string, ToolDef> = {
  TranscribeAudio: TRANSCRIBE_AUDIO_DEF,
};

/** One recognized chunk of speech from the transcription response. */
interface TranscriptionSegment {
  /** Recognized text of this chunk. */
  text: string;
  /** Speaker label, when the response carries one for this chunk. */
  speaker?: string;
}

/** The subset of the transcription response body that this module reads. */
interface TranscriptionResponse {
  /** Whole-file transcript; used only when no segments are present. */
  text?: string;
  /** Per-chunk results, optionally labelled with a speaker. */
  segments?: TranscriptionSegment[];
}

/**
 * Renders a transcription response as plain text.
 *
 * - No segments: returns `response.text` (or `''` when absent).
 * - Segments, `diarize` false: returns the segment texts concatenated as-is.
 * - Segments, `diarize` true: merges consecutive segments of the same speaker
 *   into one turn and returns one `[speaker] text` line per turn. Segments
 *   without a (non-empty) speaker label are attributed to `Unknown`.
 *
 * Turns whose merged text is empty or whitespace-only are skipped so the
 * output never contains a bare `[speaker] ` line.
 *
 * @param response Parsed response body (not schema-validated by the caller).
 * @param diarize  Whether to emit speaker-labelled turns.
 * @returns The formatted transcript; may be an empty string.
 */
function formatTranscription(response: TranscriptionResponse, diarize: boolean): string {
  // The payload is unvalidated JSON, so tolerate a missing/non-string `text`
  // instead of letting "undefined" leak into the transcript.
  const textOf = (seg: TranscriptionSegment): string =>
    typeof seg.text === 'string' ? seg.text : '';

  const segments = response.segments ?? [];
  if (segments.length === 0) {
    return response.text ?? '';
  }
  if (!diarize) {
    return segments.map(textOf).join('');
  }

  const lines: string[] = [];
  let currentSpeaker: string | undefined;
  let currentText = '';

  // Emits the turn accumulated so far, if it has any visible text.
  const flushTurn = (): void => {
    const trimmed = currentText.trim();
    if (currentSpeaker !== undefined && trimmed !== '') {
      lines.push(`[${currentSpeaker}] ${trimmed}`);
    }
  };

  for (const seg of segments) {
    // `||` (not `??`) so an empty-string label is also reported as Unknown
    // rather than producing a `[] ...` line or dropping the final turn.
    const speaker = seg.speaker || 'Unknown';
    if (speaker !== currentSpeaker) {
      flushTurn();
      currentSpeaker = speaker;
      currentText = textOf(seg);
    } else {
      currentText += textOf(seg);
    }
  }
  flushTurn();

  return lines.join('\n');
}

/**
 * Executes the `TranscribeAudio` tool.
 *
 * Reads an mp3/wav file from the workspace, POSTs it as multipart form data to
 * `<speechServerUrl>/audio/transcriptions` and returns the formatted
 * transcript. Speaker separation is requested through an `X-Diarize: true`
 * header unless `input.diarize` is explicitly `false`.
 *
 * Expected failures (bad arguments, missing config, unsupported extension,
 * unreadable file, non-2xx response, unreadable response body, timeout,
 * network error) are reported as `{ isError: true }` results, not thrown.
 * NOTE(review): `resolveAndGuard` is called outside any try block; whether it
 * throws for paths outside the workspace is not visible here — confirm.
 *
 * @param name  Tool name; anything other than `TranscribeAudio` yields `null`
 *              so the caller can try another tool module.
 * @param input Raw tool arguments (`file_path`, `language`, `diarize`, `prompt`).
 * @param ctx   Tool context supplying `workspacePath` and `toolsConfig`.
 * @returns The tool result, or `null` when `name` is not handled here.
 */
export async function executeTool(
  name: string,
  input: Record<string, unknown>,
  ctx: ToolContext,
): Promise<ToolResult | null> {
  if (name !== 'TranscribeAudio') return null;

  // Validate the type at runtime: `input` comes from model-generated JSON.
  const filePath = typeof input.file_path === 'string' ? input.file_path : undefined;
  if (!filePath) {
    return { output: 'file_path は必須です', isError: true };
  }

  const serverUrl = ctx.toolsConfig?.speechServerUrl;
  if (!serverUrl) {
    return { output: 'speech_server_url が config.yaml に未設定です', isError: true };
  }

  const resolved = resolveAndGuard(ctx.workspacePath, filePath);
  const ext = path.extname(resolved).toLowerCase().replace('.', '');
  if (!SUPPORTED_EXTENSIONS.has(ext)) {
    return { output: `対応フォーマット: mp3, wav（指定: .${ext}）`, isError: true };
  }

  if (!fs.existsSync(resolved)) {
    return { output: `ファイルが見つかりません: ${filePath}`, isError: true };
  }

  // Ignore non-string / empty values instead of forwarding them to the server.
  const language =
    typeof input.language === 'string' && input.language !== ''
      ? input.language
      : (ctx.toolsConfig?.speechLanguage ?? 'ja');
  const diarize = input.diarize !== false;
  const prompt = typeof input.prompt === 'string' ? input.prompt : undefined;
  const timeout = (ctx.toolsConfig?.speechTimeout ?? 300) * 1000;

  // Read the file separately so that I/O failures (directory, permissions,
  // file removed after the existence check) are not misreported as a server
  // connection problem.
  let fileBuffer: Buffer;
  try {
    fileBuffer = await fs.promises.readFile(resolved);
  } catch (readErr: unknown) {
    const msg = readErr instanceof Error ? readErr.message : String(readErr);
    return { output: `ファイルを読み込めません: ${filePath} (${msg})`, isError: true };
  }

  const fileName = path.basename(resolved);
  const mimeType = ext === 'mp3' ? 'audio/mpeg' : 'audio/wav';
  const blob = new Blob([fileBuffer], { type: mimeType });

  const formData = new FormData();
  formData.append('file', blob, fileName);
  formData.append('language', language);
  formData.append('response_format', 'verbose_json');
  if (prompt) {
    formData.append('prompt', prompt);
  }

  const headers: Record<string, string> = {};
  if (diarize) {
    headers['X-Diarize'] = 'true';
  }

  const url = `${serverUrl.replace(/\/+$/, '')}/audio/transcriptions`;

  // The timer stays armed until the body has been fully consumed, so a server
  // that stalls mid-response is also covered by the timeout. It is always
  // cleared in `finally`, including when `fetch` itself rejects.
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeout);

  try {
    const response = await fetch(url, {
      method: 'POST',
      body: formData,
      headers,
      signal: controller.signal,
    });

    if (!response.ok) {
      const errText = await response.text().catch(() => '');
      return {
        output: `音声認識サーバーエラー (${response.status}): ${errText}`.slice(0, 2000),
        isError: true,
      };
    }

    let parsed: unknown;
    try {
      parsed = await response.json();
    } catch (bodyErr: unknown) {
      // An abort during the body read is a timeout; let the outer handler report it.
      if (controller.signal.aborted) throw bodyErr;
      const msg = bodyErr instanceof Error ? bodyErr.message : String(bodyErr);
      return { output: `音声認識サーバーの応答を読み取れません: ${msg}`, isError: true };
    }

    // A `null` / primitive JSON body is treated as an empty result rather
    // than crashing inside the formatter.
    const data: TranscriptionResponse =
      parsed !== null && typeof parsed === 'object' ? (parsed as TranscriptionResponse) : {};
    const text = formatTranscription(data, diarize);

    if (text.trim() === '') {
      return { output: '文字起こし結果が空です', isError: true };
    }

    const header = `## 文字起こし結果: ${fileName}\n\n`;
    return { output: header + text, isError: false };
  } catch (err: unknown) {
    // Check the signal as well as the error name: the rejection value of an
    // aborted request is not guaranteed to be an `Error` named AbortError.
    if (controller.signal.aborted || (err instanceof Error && err.name === 'AbortError')) {
      return { output: `タイムアウト（${timeout / 1000}秒）: ${filePath}`, isError: true };
    }
    const msg = err instanceof Error ? err.message : String(err);
    return { output: `音声認識サーバーに接続できません: ${serverUrl} (${msg})`, isError: true };
  } finally {
    clearTimeout(timer);
  }
}