cherry-studio/src/renderer/src/services/ASRService.ts
2025-04-11 20:05:32 +08:00

671 lines
22 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import i18n from '@renderer/i18n'
import store from '@renderer/store'
/**
 * Speech-to-text (ASR) service used by the renderer.
 *
 * Two modes are implemented, selected by `settings.asrServiceType`:
 *  - `local`: commands (`start` / `stop` / `reset`) are sent over a WebSocket
 *    to a server on localhost:8080, and recognition results stream back over
 *    the same socket. The server reports when its browser page is ready.
 *  - `openai` / `browser`: audio is captured with `MediaRecorder` and
 *    transcribed after the recording has been stopped.
 */
class ASRService {
  /** WebSocket endpoint of the local ASR server. */
  private static readonly WS_URL = 'ws://localhost:8080'
  /** HTTP page of the local ASR server that is opened in a browser window. */
  private static readonly BROWSER_PAGE_URL = 'http://localhost:8080'
  /** How long after a stop command late results may still reach the callback. */
  private static readonly CALLBACK_CLEANUP_DELAY_MS = 3000

  private mediaRecorder: MediaRecorder | null = null
  private audioChunks: Blob[] = []
  private isRecording = false
  private stream: MediaStream | null = null

  // WebSocket state
  private ws: WebSocket | null = null
  private wsConnected = false
  private browserReady = false
  private reconnectAttempt = 0
  private maxReconnectAttempts = 5
  // ReturnType<typeof setTimeout> works with both DOM and Node timer typings.
  private reconnectTimeout: ReturnType<typeof setTimeout> | null = null
  private callbackCleanupTimer: ReturnType<typeof setTimeout> | null = null

  /** Receiver for recognition results in `local` mode; `null` when nobody is listening. */
  resultCallback: ((text: string, isFinal?: boolean) => void) | null = null

  /**
   * Shows a toast on the next macrotask so the message API is never invoked
   * while a render is in progress.
   */
  private notifyLater = (level: 'loading' | 'success' | 'error' | 'info', content: string, key: string): void => {
    setTimeout(() => {
      window.message[level]({ content, key })
    }, 0)
  }

  /** Extracts a readable message from an arbitrary thrown value. */
  private describeError = (error: unknown): string => {
    return error instanceof Error ? error.message : String(error)
  }

  /** Cancels the delayed callback clean-up scheduled by a previous stop, if any. */
  private cancelCallbackCleanup = (): void => {
    if (this.callbackCleanupTimer) {
      clearTimeout(this.callbackCleanupTimer)
      this.callbackCleanupTimer = null
    }
  }

  /** Stops every microphone track and drops the recorder and stream references. */
  private releaseMediaResources = (): void => {
    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop())
      this.stream = null
    }
    this.mediaRecorder = null
  }

  /**
   * Connects to the local ASR WebSocket server, reusing an open or pending
   * connection when there is one.
   *
   * @returns `true` once the socket is open, `false` if the connection failed
   *   or was closed before it opened. The promise never rejects.
   */
  connectToWebSocketServer = async (): Promise<boolean> => {
    return new Promise<boolean>((resolve) => {
      const current = this.ws
      if (current && current.readyState === WebSocket.OPEN) {
        console.log('[ASRService] WebSocket已连接')
        resolve(true)
        return
      }
      if (current && current.readyState === WebSocket.CONNECTING) {
        console.log('[ASRService] WebSocket正在连接中')
        // Wait with one-shot listeners instead of overwriting onopen/onerror:
        // replacing them would strand the promise of the call that created
        // this socket and skip its state updates.
        current.addEventListener('open', () => resolve(true), { once: true })
        current.addEventListener('error', () => resolve(false), { once: true })
        current.addEventListener('close', () => resolve(false), { once: true })
        return
      }
      // Drop a closing/closed socket. Clearing this.ws first marks it as
      // stale, so its late close event cannot reset state or reconnect.
      if (current) {
        this.ws = null
        try {
          current.close()
        } catch (e) {
          console.error('[ASRService] 关闭WebSocket连接失败:', e)
        }
      }
      try {
        console.log('[ASRService] 正在连接WebSocket服务器...')
        this.notifyLater('loading', '正在连接语音识别服务...', 'ws-connect')
        const socket = new WebSocket(ASRService.WS_URL)
        this.ws = socket
        this.wsConnected = false
        this.browserReady = false
        socket.onopen = () => {
          if (this.ws !== socket) {
            return // superseded while connecting
          }
          console.log('[ASRService] WebSocket连接成功')
          this.notifyLater('success', '语音识别服务连接成功', 'ws-connect')
          this.wsConnected = true
          this.reconnectAttempt = 0
          socket.send(JSON.stringify({ type: 'identify', role: 'electron' }))
          resolve(true)
        }
        socket.onclose = () => {
          console.log('[ASRService] WebSocket连接关闭')
          // No-op when the promise has already settled; otherwise guarantees
          // that a caller awaiting the connection is always released.
          resolve(false)
          if (this.ws !== socket) {
            return // replaced or closed on purpose: do not reconnect
          }
          this.wsConnected = false
          this.browserReady = false
          this.attemptReconnect()
        }
        socket.onerror = (error) => {
          console.error('[ASRService] WebSocket连接错误:', error)
          resolve(false)
          if (this.ws !== socket) {
            return
          }
          this.wsConnected = false
          this.notifyLater('error', '语音识别服务连接失败', 'ws-connect')
        }
        socket.onmessage = (event) => {
          if (this.ws === socket) {
            this.handleWebSocketMessage(event)
          }
        }
      } catch (error) {
        console.error('[ASRService] 创建WebSocket连接失败:', error)
        window.message.error({ content: '语音识别服务连接失败', key: 'ws-connect' })
        resolve(false)
      }
    })
  }

  /**
   * Parses one server message and dispatches it by `type`
   * (`status`, `result` or `error`). Any exception raised while handling the
   * message, including one thrown by the result callback, is logged here.
   */
  private handleWebSocketMessage = (event: MessageEvent): void => {
    try {
      const data = JSON.parse(event.data)
      console.log('[ASRService] 收到WebSocket消息:', data)
      if (data.type === 'status') {
        this.handleStatusMessage(data.message)
      } else if (data.type === 'result' && data.data) {
        this.handleResultMessage(data.data)
      } else if (data.type === 'error') {
        console.error('[ASRService] 收到错误消息:', data.message || data.data)
        this.notifyLater('error', `语音识别错误: ${data.message || data.data?.error || '未知错误'}`, 'asr-error')
      }
    } catch (error) {
      console.error('[ASRService] 解析WebSocket消息失败:', error, event.data)
    }
  }

  /** Applies a `status` message to the browser-ready and recording flags. */
  private handleStatusMessage = (status: unknown): void => {
    switch (status) {
      case 'browser_ready':
      case 'Browser connected':
        console.log('[ASRService] 浏览器已准备好')
        this.browserReady = true
        this.notifyLater('success', '语音识别浏览器已准备好', 'browser-status')
        break
      case 'Browser disconnected':
      case 'Browser connection error':
        console.log('[ASRService] 浏览器断开连接')
        this.browserReady = false
        this.notifyLater('error', '语音识别浏览器断开连接', 'browser-status')
        break
      case 'stopped':
        console.log('[ASRService] 语音识别已停止')
        this.isRecording = false
        // Covers the case where no final result arrives: still tell the user
        // that processing has finished.
        this.notifyLater('success', i18n.t('settings.asr.completed'), 'asr-processing')
        break
      case 'reset_complete': {
        console.log('[ASRService] 语音识别已强制重置')
        this.isRecording = false
        // Detach the callback before anything else so no later result reaches it.
        const pendingCallback = this.resultCallback
        this.resultCallback = null
        this.notifyLater('info', '语音识别已重置', 'asr-reset')
        if (pendingCallback && typeof pendingCallback === 'function') {
          // An empty, non-final result leaves the input untouched but lets the
          // caller reset its button state.
          setTimeout(() => {
            pendingCallback('', false)
          }, 100)
        }
        break
      }
      default:
        break
    }
  }

  /**
   * Forwards a recognition result to `resultCallback`.
   * Interim results are delivered only while recording; a final result is
   * delivered once and then detaches the callback.
   */
  private handleResultMessage = (result: { text?: string; isFinal?: boolean }): void => {
    console.log('[ASRService] 收到识别结果:', result)
    if (!this.isRecording && !result.isFinal) {
      console.log('[ASRService] 已停止录音但收到非最终结果,忽略')
      return
    }
    const callback = this.resultCallback
    if (!callback || typeof callback !== 'function') {
      console.warn('[ASRService] 没有设置结果回调函数')
      return
    }
    if (!result.text || !result.text.trim()) {
      console.log('[ASRService] 识别结果为空,不调用回调')
      return
    }
    if (result.isFinal) {
      console.log('[ASRService] 收到最终结果,调用回调函数,文本:', result.text)
      // Detach first so the same final result can never be processed twice.
      this.resultCallback = null
      this.cancelCallbackCleanup()
      callback(result.text, true)
      this.notifyLater('success', i18n.t('settings.asr.success'), 'asr-processing')
    } else if (this.isRecording) {
      console.log('[ASRService] 收到中间结果,调用回调函数,文本:', result.text)
      callback(result.text, false)
    }
  }

  /**
   * Schedules a reconnect with exponential back-off (1 s, 2 s, 4 s, ... capped
   * at 30 s) until `maxReconnectAttempts` is reached.
   */
  private attemptReconnect = (): void => {
    if (this.reconnectTimeout) {
      clearTimeout(this.reconnectTimeout)
      this.reconnectTimeout = null
    }
    if (this.reconnectAttempt >= this.maxReconnectAttempts) {
      console.log('[ASRService] 达到最大重连次数,停止重连')
      return
    }
    const delay = Math.min(1000 * Math.pow(2, this.reconnectAttempt), 30000)
    console.log(
      `[ASRService] 将在 ${delay}ms 后尝试重连 (尝试 ${this.reconnectAttempt + 1}/${this.maxReconnectAttempts})`
    )
    this.reconnectTimeout = setTimeout(() => {
      this.reconnectAttempt++
      this.connectToWebSocketServer().catch(console.error)
    }, delay)
  }

  /**
   * Starts speech capture.
   *
   * In `local` mode this connects to the WebSocket server, waits for the
   * browser page to report ready and sends a `start` command. In the other
   * modes it starts a `MediaRecorder` on the microphone.
   * Failures are reported through a toast; the promise does not reject.
   *
   * @param onTranscribed Optional receiver for streamed results (`local` mode only).
   */
  startRecording = async (onTranscribed?: (text: string, isFinal?: boolean) => void): Promise<void> => {
    // Set once microphone setup begins, so the catch block only releases
    // resources that this call acquired.
    let mediaSetupStarted = false
    try {
      const { asrEnabled, asrServiceType } = store.getState().settings
      if (!asrEnabled) {
        window.message.error({ content: i18n.t('settings.asr.error.not_enabled'), key: 'asr-error' })
        return
      }
      if (this.isRecording) {
        console.log('已经在录音中,忽略此次请求')
        return
      }
      if (asrServiceType === 'local') {
        await this.startLocalRecognition(onTranscribed)
        return
      }
      // Microphone recording path (OpenAI or browser API).
      mediaSetupStarted = true
      this.stream = await navigator.mediaDevices.getUserMedia({ audio: true })
      this.mediaRecorder = new MediaRecorder(this.stream)
      this.audioChunks = []
      this.mediaRecorder.ondataavailable = (event) => {
        if (event.data.size > 0) {
          this.audioChunks.push(event.data)
        }
      }
      this.mediaRecorder.start()
      this.isRecording = true
      console.log('开始录音')
      window.message.info({ content: i18n.t('settings.asr.recording'), key: 'asr-recording' })
    } catch (error) {
      console.error('开始录音失败:', error)
      window.message.error({
        content: i18n.t('settings.asr.error.start_failed') + ': ' + this.describeError(error),
        key: 'asr-error'
      })
      this.isRecording = false
      if (mediaSetupStarted) {
        // Without this the microphone stays open when MediaRecorder setup fails.
        this.releaseMediaResources()
      }
    }
  }

  /**
   * `local` mode start: connect, wait for the browser page, register the
   * callback and send the `start` command.
   *
   * @throws Error when the server or the browser page is not available.
   */
  private startLocalRecognition = async (onTranscribed?: (text: string, isFinal?: boolean) => void): Promise<void> => {
    const connected = await this.connectToWebSocketServer()
    if (!connected) {
      throw new Error('无法连接到语音识别服务')
    }
    if (!this.browserReady) {
      await this.waitForBrowserReady()
    }
    // A clean-up timer left over from the previous stop must not wipe the
    // callback of the session that is starting now.
    this.cancelCallbackCleanup()
    if (onTranscribed && typeof onTranscribed === 'function') {
      this.resultCallback = onTranscribed
    }
    if (!this.ws || !this.wsConnected) {
      throw new Error('WebSocket连接未就绪')
    }
    this.ws.send(JSON.stringify({ type: 'start' }))
    this.isRecording = true
    console.log('开始语音识别')
    window.message.info({ content: i18n.t('settings.asr.recording'), key: 'asr-recording' })
  }

  /**
   * Opens the server page and polls once per second, up to five times, for
   * the browser-ready status.
   *
   * @throws Error when the browser is still not ready after the last attempt.
   */
  private waitForBrowserReady = async (): Promise<void> => {
    const maxWaitAttempts = 5
    try {
      window.message.info({
        content: '正在准备语音识别服务...',
        key: 'browser-status'
      })
      console.log('尝试打开语音识别服务器页面:', ASRService.BROWSER_PAGE_URL)
      this.openBrowserPage()
    } catch (error) {
      // Best effort: the user may already have the page open.
      console.error('打开语音识别浏览器页面失败:', error)
    }
    for (let attempt = 0; !this.browserReady && attempt < maxWaitAttempts; attempt++) {
      window.message.loading({
        content: `等待浏览器准备就绪 (${attempt + 1}/${maxWaitAttempts})...`,
        key: 'browser-status'
      })
      await new Promise((resolve) => setTimeout(resolve, 1000))
    }
    if (!this.browserReady) {
      window.message.warning({
        content: '语音识别浏览器尚未准备好,请确保已打开浏览器页面',
        key: 'browser-status'
      })
      throw new Error('浏览器尚未准备好')
    }
  }

  /**
   * Stops the current capture and delivers the transcription.
   *
   * @param onTranscribed Receives the recognized text. In `local` mode it is
   *   also called once with `('', false)` shortly after the stop command so
   *   the caller can reset its UI immediately.
   */
  stopRecording = async (onTranscribed: (text: string, isFinal?: boolean) => void): Promise<void> => {
    const { asrServiceType } = store.getState().settings
    if (asrServiceType === 'local') {
      this.stopLocalRecognition(onTranscribed)
      return
    }
    // Microphone recording path (OpenAI or browser API).
    if (!this.isRecording || !this.mediaRecorder) {
      console.log('没有正在进行的录音')
      return
    }
    try {
      const recordingEnded = new Promise<Blob>((resolve) => {
        if (this.mediaRecorder) {
          this.mediaRecorder.onstop = () => {
            // Merge every captured chunk into a single blob.
            resolve(new Blob(this.audioChunks, { type: 'audio/webm' }))
          }
          this.mediaRecorder.stop()
        }
      })
      if (this.stream) {
        this.stream.getTracks().forEach((track) => track.stop())
        this.stream = null
      }
      const audioBlob = await recordingEnded
      this.isRecording = false
      this.mediaRecorder = null
      console.log('录音结束,音频大小:', audioBlob.size, 'bytes')
      window.message.loading({ content: i18n.t('settings.asr.processing'), key: 'asr-processing' })
      if (asrServiceType === 'openai') {
        await this.transcribeWithOpenAI(audioBlob, onTranscribed)
      } else if (asrServiceType === 'browser') {
        await this.transcribeWithBrowser(audioBlob, onTranscribed)
      } else {
        throw new Error(`不支持的ASR服务类型: ${asrServiceType}`)
      }
    } catch (error) {
      console.error('停止录音或转录失败:', error)
      window.message.error({
        content: i18n.t('settings.asr.error.transcribe_failed') + ': ' + this.describeError(error),
        key: 'asr-processing'
      })
      this.isRecording = false
      this.releaseMediaResources()
    }
  }

  /** `local` mode stop: sends the `stop` command and arms the callback clean-up. */
  private stopLocalRecognition = (onTranscribed: (text: string, isFinal?: boolean) => void): void => {
    if (!this.isRecording) {
      console.log('没有正在进行的语音识别')
      return
    }
    try {
      this.resultCallback = onTranscribed
      if (!this.ws || !this.wsConnected) {
        throw new Error('WebSocket连接未就绪')
      }
      this.ws.send(JSON.stringify({ type: 'stop' }))
      console.log('停止语音识别')
      window.message.loading({ content: i18n.t('settings.asr.processing'), key: 'asr-processing' })
      if (onTranscribed) {
        // Empty, non-final result: does not touch the input but lets the
        // caller update its button state right away.
        setTimeout(() => {
          onTranscribed('', false)
        }, 100)
      }
      // Safety net: stop listening for late results after a grace period.
      // The handle is kept so that a new session can cancel it.
      this.cancelCallbackCleanup()
      this.callbackCleanupTimer = setTimeout(() => {
        this.callbackCleanupTimer = null
        this.resultCallback = null
      }, ASRService.CALLBACK_CLEANUP_DELAY_MS)
      this.isRecording = false
    } catch (error) {
      console.error('停止语音识别失败:', error)
      window.message.error({
        content: i18n.t('settings.asr.error.transcribe_failed') + ': ' + this.describeError(error),
        key: 'asr-processing'
      })
      this.isRecording = false
    }
  }

  /**
   * Transcribes recorded audio by POSTing it to the URL configured in
   * `settings.asrApiUrl`, using `settings.asrModel` or `whisper-1`.
   *
   * @param audioBlob Recorded audio.
   * @param onTranscribed Called with the recognized text on success.
   * @throws Error when the API key is missing, the request fails or no text
   *   is returned.
   */
  private transcribeWithOpenAI = async (audioBlob: Blob, onTranscribed: (text: string) => void): Promise<void> => {
    try {
      const { asrApiKey, asrApiUrl, asrModel } = store.getState().settings
      if (!asrApiKey) {
        throw new Error(i18n.t('settings.asr.error.no_api_key'))
      }
      const formData = new FormData()
      formData.append('file', audioBlob, 'recording.webm')
      formData.append('model', asrModel || 'whisper-1')
      const response = await fetch(asrApiUrl, {
        method: 'POST',
        headers: {
          Authorization: `Bearer ${asrApiKey}`
        },
        body: formData
      })
      if (!response.ok) {
        // Error bodies are not guaranteed to be JSON (e.g. a proxy error
        // page); a parse failure must not mask the HTTP failure itself.
        const errorData = await response.json().catch(() => null)
        throw new Error(errorData?.error?.message || `OpenAI语音识别失败 (HTTP ${response.status})`)
      }
      const data = await response.json()
      const transcribedText = data.text
      if (!transcribedText) {
        throw new Error('未能识别出文本')
      }
      console.log('语音识别成功:', transcribedText)
      window.message.success({ content: i18n.t('settings.asr.success'), key: 'asr-processing' })
      onTranscribed(transcribedText)
    } catch (error) {
      console.error('OpenAI语音识别失败:', error)
      throw error
    }
  }

  /**
   * Placeholder for Web Speech API transcription.
   *
   * The Web Speech API cannot process a recorded blob, so this only checks
   * that the API exists and then reports a fixed placeholder text.
   *
   * @param _audioBlob Unused.
   * @param onTranscribed Called with the placeholder text.
   * @throws Error when the browser exposes no SpeechRecognition implementation.
   */
  private transcribeWithBrowser = async (_audioBlob: Blob, onTranscribed: (text: string) => void): Promise<void> => {
    try {
      if (!('webkitSpeechRecognition' in window) && !('SpeechRecognition' in window)) {
        throw new Error(i18n.t('settings.asr.error.browser_not_support'))
      }
      window.message.success({ content: i18n.t('settings.asr.success'), key: 'asr-processing' })
      onTranscribed('浏览器语音识别功能尚未完全实现')
    } catch (error) {
      console.error('浏览器语音识别失败:', error)
      throw error
    }
  }

  /** @returns Whether a recording or recognition session is in progress. */
  isCurrentlyRecording = (): boolean => {
    return this.isRecording
  }

  /** @returns `true` only when the socket is open and the browser page has reported ready. */
  isWebSocketConnected = (): boolean => {
    return this.wsConnected && this.browserReady
  }

  /**
   * Aborts the current session without delivering a result.
   * In `local` mode this also runs when only a callback is pending, and it
   * follows the `stop` command with a `reset` command.
   */
  cancelRecording = (): void => {
    const { asrServiceType } = store.getState().settings
    if (asrServiceType === 'local') {
      if (this.isRecording || this.resultCallback) {
        // Reset state first so results arriving from now on are ignored.
        this.isRecording = false
        this.resultCallback = null
        this.cancelCallbackCleanup()
        if (this.ws && this.wsConnected) {
          this.ws.send(JSON.stringify({ type: 'stop' }))
          console.log('发送停止命令到WebSocket服务器')
          // Follow up with a reset command shortly after the stop.
          setTimeout(() => {
            if (this.ws && this.wsConnected) {
              this.ws.send(JSON.stringify({ type: 'reset' }))
              console.log('发送重置命令到WebSocket服务器')
            }
          }, 100)
        }
        console.log('语音识别已取消')
        window.message.info({ content: i18n.t('settings.asr.canceled'), key: 'asr-recording' })
      }
      return
    }
    // Microphone recording path (OpenAI or browser API).
    if (this.isRecording && this.mediaRecorder) {
      // stop() throws InvalidStateError on an inactive recorder, which would
      // leave isRecording stuck at true.
      if (this.mediaRecorder.state !== 'inactive') {
        this.mediaRecorder.stop()
      }
      this.releaseMediaResources()
      this.isRecording = false
      this.audioChunks = []
      console.log('录音已取消')
      window.message.info({ content: i18n.t('settings.asr.canceled'), key: 'asr-recording' })
    }
  }

  /**
   * Closes the WebSocket connection for good: cancels any pending reconnect
   * and ensures the close event of this socket does not schedule a new one.
   */
  closeWebSocketConnection = (): void => {
    const socket = this.ws
    // Clear the reference before closing: the socket's handlers compare
    // against this.ws, so its close event is then treated as stale and
    // cannot trigger an automatic reconnect.
    this.ws = null
    this.wsConnected = false
    this.browserReady = false
    if (this.reconnectTimeout) {
      clearTimeout(this.reconnectTimeout)
      this.reconnectTimeout = null
    }
    if (socket) {
      try {
        socket.close()
      } catch (e) {
        console.error('[ASRService] 关闭WebSocket连接失败:', e)
      }
    }
  }

  /** Opens the local ASR server page in a new window. */
  openBrowserPage = (): void => {
    window.open(ASRService.BROWSER_PAGE_URL, '_blank')
  }
}
// Module-level singleton: the default export is this single shared instance.
const asrService = new ASRService()
export default asrService