![guo zebin](/assets/img/avatar_default.png)
- 全新 UI
- 全面优化 WebSocket 逻辑,提高数字人和 UI 连接的稳定性并降低资源开销
- 全面优化唤醒逻辑,提供稳定的普通唤醒模式和前置词唤醒模式
- 优化拾音质量,支持多声道麦克风拾音
- 优化自动播放服务器的对接机制,提供稳定且兼容旧版 UE 工程的对接模式
- 数字人接口输出机器人表情,以适应新 Fay UI 及单片机的数字人表情输出
- 使用更高级的音频时长计算方式,可以更精准地控制音频播放完成后的逻辑
- 修复点击关闭按钮会导致程序退出的 bug
- 修复没有麦克风的设备开启麦克风会出错的问题
- 为服务器主机地址提供配置项,以方便服务器部署
135 lines
5.7 KiB
135 lines
5.7 KiB
import time
import asyncio
import azure.cognitiveservices.speech as speechsdk
import asyncio
from tts import tts_voice
from tts.tts_voice import EnumVoice
from utils import util, config_util
from utils import config_util as cfg
import pygame
import edge_tts
from pydub import AudioSegment
from scheduler.thread_manager import MyThread
class Speech:
    """Text-to-speech front end that renders text into WAV files under ``./samples``.

    When an Azure TTS key is configured, synthesis goes through the Azure
    Speech SDK (``ms_tts`` is ``True``); otherwise it falls back to edge-tts,
    whose MP3 output is converted to WAV. Every finished sample is remembered
    per (voice, style, text) so a repeated request returns the existing path
    instead of synthesizing again.
    """

    def __init__(self):
        """Read the TTS configuration and, if an Azure key is set, build the synthesizer."""
        self.ms_tts = False
        self.__connection = None
        # Maps (voice_name, style, text) -> path of the WAV already produced.
        self.__history_data = {}

        ms_key = config_util.key_ms_tts_key
        # A missing, empty or whitespace-only key means "use edge-tts".
        if ms_key and ms_key.strip():
            self.__speech_config = speechsdk.SpeechConfig(
                subscription=ms_key,
                region=config_util.key_ms_tts_region,
            )
            self.__speech_config.speech_recognition_language = "zh-CN"
            self.__speech_config.speech_synthesis_voice_name = self.__resolve_voice_name()
            # audio_config=None keeps the SDK from playing the audio itself;
            # the result is written to a file in to_sample() instead.
            self.__synthesizer = speechsdk.SpeechSynthesizer(
                speech_config=self.__speech_config,
                audio_config=None,
            )
            self.ms_tts = True

    @staticmethod
    def __resolve_voice_name():
        """Return the configured voice name, defaulting to ``EnumVoice.XIAO_XIAO``."""
        voice_type = tts_voice.get_voice_of(config_util.config["attribute"]["voice"])
        if voice_type is None:
            return EnumVoice.XIAO_XIAO.value["voiceName"]
        return voice_type.value["voiceName"]

    @staticmethod
    def __wav_path_for(mp3_filepath):
        """Return *mp3_filepath* with its extension replaced by ``.wav``.

        ``os.path.splitext`` only looks at the final path component, so a
        path without an extension (e.g. ``./samples/x``) is not truncated at
        the dot of a leading ``./``.
        """
        import os  # local import: this module's import block does not provide os

        return os.path.splitext(mp3_filepath)[0] + ".wav"

    def __get_history(self, voice_name, style, text):
        """Return the cached WAV path for this request, or ``None`` if there is none."""
        return self.__history_data.get((voice_name, style, text))

    def connect(self):
        """Pre-open the Azure service connection (no-op for edge-tts) and log it."""
        if self.ms_tts:
            self.__connection = speechsdk.Connection.from_speech_synthesizer(self.__synthesizer)
            # from_speech_synthesizer() only hands back the Connection object;
            # open() is what actually starts setting the connection up.
            self.__connection.open(True)
        util.log(1, "TTS 服务已经连接!")

    def close(self):
        """Close the Azure connection if one was opened; safe to call repeatedly."""
        if self.__connection is not None:
            self.__connection.close()
            self.__connection = None

    async def get_edge_tts(self, text, voice, file_url) -> None:
        """Synthesize *text* with edge-tts voice *voice* and save it as MP3 to *file_url*."""
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(file_url)

    def convert_mp3_to_wav(self, mp3_filepath):
        """Convert an MP3 file to a 44.1 kHz WAV next to it and return the WAV path."""
        # Resample to 44100 Hz before exporting.
        audio = AudioSegment.from_mp3(mp3_filepath).set_frame_rate(44100)
        wav_filepath = self.__wav_path_for(mp3_filepath)
        audio.export(wav_filepath, format="wav")
        return wav_filepath

    def to_sample(self, text, style):
        """Convert text to speech.

        :param text: the text to speak
        :param style: speaking style / tone. Both back ends synthesize plain
            text, so this value currently only distinguishes cache entries.
        :returns: path of the generated WAV file, or ``None`` if synthesis failed
        """
        if self.ms_tts:
            voice_name = self.__resolve_voice_name()
            cached = self.__get_history(voice_name, style, text)
            if cached is not None:
                return cached

            result = self.__synthesizer.speak_text_async(text).get()
            if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
                util.log(1, "[x] 语音转换失败!")
                util.log(1, "[x] 原因: " + str(result.reason))
                return None

            wav_url = './samples/sample-' + str(int(time.time() * 1000)) + '.wav'
            # Persist the audio; without this the returned path names no file.
            speechsdk.AudioDataStream(result).save_to_wav_file(wav_url)
            self.__history_data[(voice_name, style, text)] = wav_url
            return wav_url

        # edge-tts fallback: best effort, any failure is logged and reported as None.
        try:
            voice_name = self.__resolve_voice_name()
            cached = self.__get_history(voice_name, style, text)
            if cached is not None:
                return cached

            mp3_url = './samples/sample-' + str(int(time.time() * 1000)) + '.mp3'
            # Generate the MP3 first; convert_mp3_to_wav() needs the file to exist.
            asyncio.run(self.get_edge_tts(text, voice_name, mp3_url))
            wav_url = self.convert_mp3_to_wav(mp3_url)
            self.__history_data[(voice_name, style, text)] = wav_url
        except Exception as e:
            util.log(1, "[x] 语音转换失败!")
            util.log(1, "[x] 原因: " + str(e))
            wav_url = None
        return wav_url
if __name__ == '__main__':
    # Manual smoke test: synthesize one sentence in the "cheerful" style.
    # The resulting path is kept in a local only; nothing is played or printed.
    speech = Speech()
    sample_text = "我叫Fay,我今年18岁,很年青。"
    sample_path = speech.to_sample(sample_text, "cheerful")