mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2025-12-24 18:50:56 +08:00
644 lines
19 KiB
TypeScript
644 lines
19 KiB
TypeScript
import fs from 'node:fs'
|
||
import path from 'node:path'
|
||
|
||
import { app } from 'electron'
|
||
import log from 'electron-log'
|
||
import { EdgeTTS } from 'node-edge-tts' // 旧版TTS库
|
||
import { MsEdgeTTS, OUTPUT_FORMAT } from 'edge-tts-node' // 新版支持流式的TTS库
|
||
|
||
// --- START OF HARDCODED VOICE LIST ---
|
||
// WARNING: This list is static and may become outdated.
|
||
// It's generally recommended to use listVoices() for the most up-to-date list.
|
||
const hardcodedVoices = [
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (af-ZA, AdriNeural)',
|
||
ShortName: 'af-ZA-AdriNeural',
|
||
Gender: 'Female',
|
||
Locale: 'af-ZA'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (am-ET, MekdesNeural)',
|
||
ShortName: 'am-ET-MekdesNeural',
|
||
Gender: 'Female',
|
||
Locale: 'am-ET'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (ar-AE, FatimaNeural)',
|
||
ShortName: 'ar-AE-FatimaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'ar-AE'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (ar-AE, HamdanNeural)',
|
||
ShortName: 'ar-AE-HamdanNeural',
|
||
Gender: 'Male',
|
||
Locale: 'ar-AE'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (ar-BH, AliNeural)',
|
||
ShortName: 'ar-BH-AliNeural',
|
||
Gender: 'Male',
|
||
Locale: 'ar-BH'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (ar-BH, LailaNeural)',
|
||
ShortName: 'ar-BH-LailaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'ar-BH'
|
||
},
|
||
// ... (Many other Arabic locales/voices) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (ar-SA, ZariyahNeural)',
|
||
ShortName: 'ar-SA-ZariyahNeural',
|
||
Gender: 'Female',
|
||
Locale: 'ar-SA'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (az-AZ, BabekNeural)',
|
||
ShortName: 'az-AZ-BabekNeural',
|
||
Gender: 'Male',
|
||
Locale: 'az-AZ'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (az-AZ, BanuNeural)',
|
||
ShortName: 'az-AZ-BanuNeural',
|
||
Gender: 'Female',
|
||
Locale: 'az-AZ'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (bg-BG, BorislavNeural)',
|
||
ShortName: 'bg-BG-BorislavNeural',
|
||
Gender: 'Male',
|
||
Locale: 'bg-BG'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (bg-BG, KalinaNeural)',
|
||
ShortName: 'bg-BG-KalinaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'bg-BG'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (bn-BD, NabanitaNeural)',
|
||
ShortName: 'bn-BD-NabanitaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'bn-BD'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (bn-BD, PradeepNeural)',
|
||
ShortName: 'bn-BD-PradeepNeural',
|
||
Gender: 'Male',
|
||
Locale: 'bn-BD'
|
||
},
|
||
// ... (Catalan, Czech, Welsh, Danish, German, Greek, English variants) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-AU, NatashaNeural)',
|
||
ShortName: 'en-AU-NatashaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-AU'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-AU, WilliamNeural)',
|
||
ShortName: 'en-AU-WilliamNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-AU'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-CA, ClaraNeural)',
|
||
ShortName: 'en-CA-ClaraNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-CA'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-CA, LiamNeural)',
|
||
ShortName: 'en-CA-LiamNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-CA'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-GB, LibbyNeural)',
|
||
ShortName: 'en-GB-LibbyNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-GB'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-GB, MaisieNeural)',
|
||
ShortName: 'en-GB-MaisieNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-GB'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-GB, RyanNeural)',
|
||
ShortName: 'en-GB-RyanNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-GB'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-GB, SoniaNeural)',
|
||
ShortName: 'en-GB-SoniaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-GB'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-GB, ThomasNeural)',
|
||
ShortName: 'en-GB-ThomasNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-GB'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-HK, SamNeural)',
|
||
ShortName: 'en-HK-SamNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-HK'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-HK, YanNeural)',
|
||
ShortName: 'en-HK-YanNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-HK'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-IE, ConnorNeural)',
|
||
ShortName: 'en-IE-ConnorNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-IE'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-IE, EmilyNeural)',
|
||
ShortName: 'en-IE-EmilyNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-IE'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaNeural)',
|
||
ShortName: 'en-IN-NeerjaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-IN'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-IN, PrabhatNeural)',
|
||
ShortName: 'en-IN-PrabhatNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-IN'
|
||
},
|
||
// ... (Many more English variants: KE, NG, NZ, PH, SG, TZ, US, ZA) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)',
|
||
ShortName: 'en-US-AriaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-US'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-US, AnaNeural)',
|
||
ShortName: 'en-US-AnaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-US'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-US, ChristopherNeural)',
|
||
ShortName: 'en-US-ChristopherNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-US'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-US, EricNeural)',
|
||
ShortName: 'en-US-EricNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-US'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-US, GuyNeural)',
|
||
ShortName: 'en-US-GuyNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-US'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)',
|
||
ShortName: 'en-US-JennyNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-US'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-US, MichelleNeural)',
|
||
ShortName: 'en-US-MichelleNeural',
|
||
Gender: 'Female',
|
||
Locale: 'en-US'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-US, RogerNeural)',
|
||
ShortName: 'en-US-RogerNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-US'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (en-US, SteffanNeural)',
|
||
ShortName: 'en-US-SteffanNeural',
|
||
Gender: 'Male',
|
||
Locale: 'en-US'
|
||
},
|
||
// ... (Spanish variants) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (es-MX, DaliaNeural)',
|
||
ShortName: 'es-MX-DaliaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'es-MX'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (es-MX, JorgeNeural)',
|
||
ShortName: 'es-MX-JorgeNeural',
|
||
Gender: 'Male',
|
||
Locale: 'es-MX'
|
||
},
|
||
// ... (Estonian, Basque, Persian, Finnish, Filipino, French, Irish, Galician, Gujarati, Hebrew, Hindi, Croatian, Hungarian, Indonesian, Icelandic, Italian, Japanese) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (ja-JP, KeitaNeural)',
|
||
ShortName: 'ja-JP-KeitaNeural',
|
||
Gender: 'Male',
|
||
Locale: 'ja-JP'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (ja-JP, NanamiNeural)',
|
||
ShortName: 'ja-JP-NanamiNeural',
|
||
Gender: 'Female',
|
||
Locale: 'ja-JP'
|
||
},
|
||
// ... (Javanese, Georgian, Kazakh, Khmer, Kannada, Korean) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (ko-KR, InJoonNeural)',
|
||
ShortName: 'ko-KR-InJoonNeural',
|
||
Gender: 'Male',
|
||
Locale: 'ko-KR'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (ko-KR, SunHiNeural)',
|
||
ShortName: 'ko-KR-SunHiNeural',
|
||
Gender: 'Female',
|
||
Locale: 'ko-KR'
|
||
},
|
||
// ... (Lao, Lithuanian, Latvian, Macedonian, Malayalam, Mongolian, Marathi, Malay, Maltese, Burmese, Norwegian, Dutch, Polish, Pashto, Portuguese) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (pt-BR, AntonioNeural)',
|
||
ShortName: 'pt-BR-AntonioNeural',
|
||
Gender: 'Male',
|
||
Locale: 'pt-BR'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (pt-BR, FranciscaNeural)',
|
||
ShortName: 'pt-BR-FranciscaNeural',
|
||
Gender: 'Female',
|
||
Locale: 'pt-BR'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (pt-PT, DuarteNeural)',
|
||
ShortName: 'pt-PT-DuarteNeural',
|
||
Gender: 'Male',
|
||
Locale: 'pt-PT'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (pt-PT, RaquelNeural)',
|
||
ShortName: 'pt-PT-RaquelNeural',
|
||
Gender: 'Female',
|
||
Locale: 'pt-PT'
|
||
},
|
||
// ... (Romanian, Russian, Sinhala, Slovak, Slovenian, Somali, Albanian, Serbian, Sundanese, Swedish, Swahili, Tamil, Telugu, Thai) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (th-TH, NiwatNeural)',
|
||
ShortName: 'th-TH-NiwatNeural',
|
||
Gender: 'Male',
|
||
Locale: 'th-TH'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (th-TH, PremwadeeNeural)',
|
||
ShortName: 'th-TH-PremwadeeNeural',
|
||
Gender: 'Female',
|
||
Locale: 'th-TH'
|
||
},
|
||
// ... (Turkish, Ukrainian, Urdu, Uzbek, Vietnamese) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (vi-VN, HoaiMyNeural)',
|
||
ShortName: 'vi-VN-HoaiMyNeural',
|
||
Gender: 'Female',
|
||
Locale: 'vi-VN'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (vi-VN, NamMinhNeural)',
|
||
ShortName: 'vi-VN-NamMinhNeural',
|
||
Gender: 'Male',
|
||
Locale: 'vi-VN'
|
||
},
|
||
// ... (Chinese variants) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)',
|
||
ShortName: 'zh-CN-XiaoxiaoNeural',
|
||
Gender: 'Female',
|
||
Locale: 'zh-CN'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiNeural)',
|
||
ShortName: 'zh-CN-YunxiNeural',
|
||
Gender: 'Male',
|
||
Locale: 'zh-CN'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-CN, YunjianNeural)',
|
||
ShortName: 'zh-CN-YunjianNeural',
|
||
Gender: 'Male',
|
||
Locale: 'zh-CN'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiaNeural)',
|
||
ShortName: 'zh-CN-YunxiaNeural',
|
||
Gender: 'Male',
|
||
Locale: 'zh-CN'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-CN, YunyangNeural)',
|
||
ShortName: 'zh-CN-YunyangNeural',
|
||
Gender: 'Male',
|
||
Locale: 'zh-CN'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-CN-liaoning, XiaobeiNeural)',
|
||
ShortName: 'zh-CN-liaoning-XiaobeiNeural',
|
||
Gender: 'Female',
|
||
Locale: 'zh-CN-liaoning'
|
||
},
|
||
// { Name: 'Microsoft Server Speech Text to Speech Voice (zh-CN-shaanxi, XiaoniNeural)', ShortName: 'zh-CN-shaanxi-XiaoniNeural', Gender: 'Female', Locale: 'zh-CN-shaanxi' }, // Example regional voice
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-HK, HiuGaaiNeural)',
|
||
ShortName: 'zh-HK-HiuGaaiNeural',
|
||
Gender: 'Female',
|
||
Locale: 'zh-HK'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-HK, HiuMaanNeural)',
|
||
ShortName: 'zh-HK-HiuMaanNeural',
|
||
Gender: 'Female',
|
||
Locale: 'zh-HK'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-HK, WanLungNeural)',
|
||
ShortName: 'zh-HK-WanLungNeural',
|
||
Gender: 'Male',
|
||
Locale: 'zh-HK'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoChenNeural)',
|
||
ShortName: 'zh-TW-HsiaoChenNeural',
|
||
Gender: 'Female',
|
||
Locale: 'zh-TW'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoYuNeural)',
|
||
ShortName: 'zh-TW-HsiaoYuNeural',
|
||
Gender: 'Female',
|
||
Locale: 'zh-TW'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zh-TW, YunJheNeural)',
|
||
ShortName: 'zh-TW-YunJheNeural',
|
||
Gender: 'Male',
|
||
Locale: 'zh-TW'
|
||
},
|
||
// ... (Zulu) ...
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zu-ZA, ThandoNeural)',
|
||
ShortName: 'zu-ZA-ThandoNeural',
|
||
Gender: 'Female',
|
||
Locale: 'zu-ZA'
|
||
},
|
||
{
|
||
Name: 'Microsoft Server Speech Text to Speech Voice (zu-ZA, ThembaNeural)',
|
||
ShortName: 'zu-ZA-ThembaNeural',
|
||
Gender: 'Male',
|
||
Locale: 'zu-ZA'
|
||
}
|
||
]
|
||
// --- END OF HARDCODED VOICE LIST ---
|
||
|
||
/**
|
||
* 免费在线TTS服务
|
||
* 使用免费的在线TTS服务,不需要API密钥
|
||
*/
|
||
class MsTTSService {
|
||
private static instance: MsTTSService
|
||
private tempDir: string
|
||
|
||
private constructor() {
|
||
this.tempDir = path.join(app.getPath('temp'), 'cherry-tts')
|
||
if (!fs.existsSync(this.tempDir)) {
|
||
fs.mkdirSync(this.tempDir, { recursive: true })
|
||
}
|
||
log.info('初始化免费在线TTS服务 (使用硬编码语音列表)')
|
||
}
|
||
|
||
public static getInstance(): MsTTSService {
|
||
if (!MsTTSService.instance) {
|
||
MsTTSService.instance = new MsTTSService()
|
||
}
|
||
return MsTTSService.instance
|
||
}
|
||
|
||
/**
|
||
* 流式合成语音
|
||
* @param text 要合成的文本
|
||
* @param voice 语音的 ShortName (例如 'zh-CN-XiaoxiaoNeural')
|
||
* @param outputFormat 输出格式 (例如 'audio-24khz-48kbitrate-mono-mp3')
|
||
* @param onData 数据块回调
|
||
* @param onEnd 结束回调
|
||
*/
|
||
public async synthesizeStream(
|
||
text: string,
|
||
voice: string,
|
||
outputFormat: string,
|
||
onData: (chunk: Uint8Array) => void,
|
||
onEnd: () => void
|
||
): Promise<void> {
|
||
try {
|
||
// 记录详细的请求信息
|
||
log.info(`流式微软在线TTS合成语音: 文本="${text.substring(0, 30)}...", 语音=${voice}, 格式=${outputFormat}`)
|
||
|
||
// 验证输入参数
|
||
if (!text || text.trim() === '') {
|
||
throw new Error('要合成的文本不能为空')
|
||
}
|
||
|
||
if (!voice || voice.trim() === '') {
|
||
throw new Error('语音名称不能为空')
|
||
}
|
||
|
||
// 创建一个新的MsEdgeTTS实例
|
||
const tts = new MsEdgeTTS({
|
||
enableLogger: false // 禁用内部日志
|
||
})
|
||
|
||
// 设置元数据
|
||
let msOutputFormat: OUTPUT_FORMAT
|
||
if (outputFormat.includes('mp3')) {
|
||
msOutputFormat = OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3
|
||
} else if (outputFormat.includes('webm')) {
|
||
msOutputFormat = OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS
|
||
} else {
|
||
msOutputFormat = OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3
|
||
}
|
||
|
||
await tts.setMetadata(voice, msOutputFormat)
|
||
|
||
// 创建流
|
||
const audioStream = tts.toStream(text)
|
||
|
||
// 监听数据事件
|
||
audioStream.on('data', (data: Buffer) => {
|
||
onData(data)
|
||
})
|
||
|
||
// 监听结束事件
|
||
audioStream.on('end', () => {
|
||
log.info(`流式微软在线TTS合成成功`)
|
||
onEnd()
|
||
})
|
||
|
||
// 监听错误事件
|
||
audioStream.on('error', (error: Error) => {
|
||
log.error(`流式微软在线TTS语音合成失败:`, error)
|
||
throw error
|
||
})
|
||
} catch (error: any) {
|
||
// 记录详细的错误信息
|
||
log.error(`流式微软在线TTS语音合成失败 (语音=${voice}):`, error)
|
||
throw error
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 获取可用的语音列表 (返回硬编码列表)
|
||
* @returns 语音列表
|
||
*/
|
||
public async getVoices(): Promise<any[]> {
|
||
try {
|
||
log.info(`返回硬编码的 ${hardcodedVoices.length} 个语音列表`)
|
||
// 直接返回硬编码的列表
|
||
// 注意:保持 async 是为了接口兼容性,虽然这里没有实际的异步操作
|
||
return hardcodedVoices
|
||
} catch (error) {
|
||
// 这个 try/catch 在这里意义不大了,因为返回静态数据不会出错
|
||
// 但保留结构以防未来改动
|
||
log.error('获取硬编码语音列表时出错 (理论上不应发生):', error)
|
||
return [] // 返回空列表以防万一
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 合成语音
|
||
* @param text 要合成的文本
|
||
* @param voice 语音的 ShortName (例如 'zh-CN-XiaoxiaoNeural')
|
||
* @param outputFormat 输出格式 (例如 'audio-24khz-48kbitrate-mono-mp3')
|
||
* @returns 音频文件路径
|
||
*/
|
||
public async synthesize(text: string, voice: string, outputFormat: string): Promise<string> {
|
||
try {
|
||
// 记录详细的请求信息
|
||
log.info(`微软在线TTS合成语音: 文本="${text.substring(0, 30)}...", 语音=${voice}, 格式=${outputFormat}`)
|
||
|
||
// 验证输入参数
|
||
if (!text || text.trim() === '') {
|
||
throw new Error('要合成的文本不能为空')
|
||
}
|
||
|
||
if (!voice || voice.trim() === '') {
|
||
throw new Error('语音名称不能为空')
|
||
}
|
||
|
||
// 创建一个新的EdgeTTS实例,并设置参数
|
||
// 添加超时设置,默认为30秒
|
||
const tts = new EdgeTTS({
|
||
voice: voice,
|
||
outputFormat: outputFormat,
|
||
timeout: 30000, // 30秒超时
|
||
rate: '+0%', // 正常语速
|
||
pitch: '+0Hz', // 正常音调
|
||
volume: '+0%' // 正常音量
|
||
})
|
||
|
||
// 生成临时文件路径
|
||
const timestamp = Date.now()
|
||
const fileExtension = outputFormat.includes('mp3') ? 'mp3' : outputFormat.split('-').pop() || 'audio'
|
||
const outputPath = path.join(this.tempDir, `tts_${timestamp}.${fileExtension}`)
|
||
|
||
log.info(`开始生成语音文件: ${outputPath}`)
|
||
|
||
// 使用ttsPromise方法生成文件
|
||
await tts.ttsPromise(text, outputPath)
|
||
|
||
// 验证生成的文件是否存在且大小大于0
|
||
if (!fs.existsSync(outputPath)) {
|
||
throw new Error(`生成的语音文件不存在: ${outputPath}`)
|
||
}
|
||
|
||
const stats = fs.statSync(outputPath)
|
||
if (stats.size === 0) {
|
||
throw new Error(`生成的语音文件大小为0: ${outputPath}`)
|
||
}
|
||
|
||
log.info(`微软在线TTS合成成功: ${outputPath}, 文件大小: ${stats.size} 字节`)
|
||
return outputPath
|
||
} catch (error: any) {
|
||
// 记录详细的错误信息
|
||
log.error(`微软在线TTS语音合成失败 (语音=${voice}):`, error)
|
||
|
||
// 尝试提供更有用的错误信息
|
||
if (error.message && typeof error.message === 'string') {
|
||
if (error.message.includes('Timed out')) {
|
||
throw new Error(`语音合成超时,请检查网络连接或尝试其他语音`)
|
||
} else if (error.message.includes('ENOTFOUND')) {
|
||
throw new Error(`无法连接到微软语音服务,请检查网络连接`)
|
||
} else if (error.message.includes('ECONNREFUSED')) {
|
||
throw new Error(`连接被拒绝,请检查网络设置或代理配置`)
|
||
}
|
||
}
|
||
|
||
throw error
|
||
}
|
||
}
|
||
|
||
/**
|
||
* (可选) 清理临时文件目录
|
||
*/
|
||
public async cleanupTempDir(): Promise<void> {
|
||
// (Cleanup method remains the same)
|
||
try {
|
||
const files = await fs.promises.readdir(this.tempDir)
|
||
for (const file of files) {
|
||
if (file.startsWith('tts_')) {
|
||
await fs.promises.unlink(path.join(this.tempDir, file))
|
||
}
|
||
}
|
||
log.info('TTS 临时文件已清理')
|
||
} catch (error) {
|
||
log.error('清理 TTS 临时文件失败:', error)
|
||
}
|
||
}
|
||
}
|
||
|
||
// 导出单例方法 (保持不变)
|
||
export const getVoices = async () => {
|
||
return await MsTTSService.getInstance().getVoices()
|
||
}
|
||
|
||
export const synthesize = async (text: string, voice: string, outputFormat: string) => {
|
||
return await MsTTSService.getInstance().synthesize(text, voice, outputFormat)
|
||
}
|
||
|
||
export const synthesizeStream = async (
|
||
text: string,
|
||
voice: string,
|
||
outputFormat: string,
|
||
onData: (chunk: Uint8Array) => void,
|
||
onEnd: () => void
|
||
) => {
|
||
return await MsTTSService.getInstance().synthesizeStream(text, voice, outputFormat, onData, onEnd)
|
||
}
|
||
|
||
export const cleanupTtsTempFiles = async () => {
|
||
await MsTTSService.getInstance().cleanupTempDir()
|
||
}
|