4cfad5ae0f
- Brand-new UI
- Fully reworked WebSocket logic, improving the stability and resource overhead of the connection between the digital human and the UI
- Fully reworked wake-up logic, providing a stable normal wake mode and a prefix-word wake mode
- Improved audio capture quality, with support for multi-channel microphone pickup
- Improved the integration mechanism of the auto-play server, providing a stable mode that remains compatible with older UE projects
- The digital human interface now outputs robot expressions, to support expression output in the new Fay UI and on microcontroller-based digital humans
- A more accurate audio duration calculation is used, allowing more precise control of the logic that runs after playback finishes (a duration sketch follows this list)
- Fixed a bug where clicking the close button caused the program to exit
- Fixed an error when enabling the microphone on devices that have none
- Added a configuration option for the server host address to simplify server deployment
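A minimal sketch of the duration calculation mentioned above, assuming a pydub-based approach (pydub is already used by the TTS module below); get_audio_duration is a hypothetical helper, not part of the repo:

from pydub import AudioSegment

def get_audio_duration(wav_filepath):
    # Decode the wav and read its exact length in seconds from the sample data,
    # so logic that should run after playback finishes can be scheduled precisely.
    audio = AudioSegment.from_wav(wav_filepath)
    return audio.duration_seconds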
135 lines
5.7 KiB
Python
import time
import asyncio
import azure.cognitiveservices.speech as speechsdk
from tts import tts_voice
from tts.tts_voice import EnumVoice
from utils import util, config_util
from utils import config_util as cfg
import pygame
import edge_tts
from pydub import AudioSegment
from scheduler.thread_manager import MyThread


class Speech:
    def __init__(self):
        # Use Azure TTS when a subscription key is configured; otherwise fall back to edge-tts.
        self.ms_tts = False
        voice_type = tts_voice.get_voice_of(config_util.config["attribute"]["voice"])
        voice_name = EnumVoice.XIAO_XIAO.value["voiceName"]
        if voice_type is not None:
            voice_name = voice_type.value["voiceName"]
        if config_util.key_ms_tts_key is not None and config_util.key_ms_tts_key.strip() != "":
            self.__speech_config = speechsdk.SpeechConfig(subscription=cfg.key_ms_tts_key, region=cfg.key_ms_tts_region)
            self.__speech_config.speech_recognition_language = "zh-CN"
            self.__speech_config.speech_synthesis_voice_name = voice_name
            self.__speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
            self.__synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.__speech_config, audio_config=None)
            self.ms_tts = True
        self.__connection = None
        self.__history_data = []

    def __get_history(self, voice_name, style, text):
        # Return the cached wav path of an identical (voice, style, text) request, if any.
        for data in self.__history_data:
            if data[0] == voice_name and data[1] == style and data[2] == text:
                return data[3]
        return None

    def connect(self):
        if self.ms_tts:
            self.__connection = speechsdk.Connection.from_speech_synthesizer(self.__synthesizer)
            self.__connection.open(True)
        util.log(1, "TTS service connected.")

    def close(self):
        if self.__connection is not None:
            self.__connection.close()

    # Generate an mp3 file with edge-tts.
    async def get_edge_tts(self, text, voice, file_url) -> None:
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(file_url)

    def convert_mp3_to_wav(self, mp3_filepath):
        audio = AudioSegment.from_mp3(mp3_filepath)
        # Set the sample rate to 44.1 kHz with set_frame_rate.
        audio = audio.set_frame_rate(44100)
        wav_filepath = mp3_filepath.rsplit(".", 1)[0] + ".wav"
        audio.export(wav_filepath, format="wav")
        return wav_filepath

    def to_sample(self, text, style):
        """
        Text to speech.
        :param text: text to synthesize
        :param style: speaking style / tone
        :returns: path of the generated audio file
        """
        if self.ms_tts:
            voice_type = tts_voice.get_voice_of(config_util.config["attribute"]["voice"])
            voice_name = EnumVoice.XIAO_XIAO.value["voiceName"]
            if voice_type is not None:
                voice_name = voice_type.value["voiceName"]
            history = self.__get_history(voice_name, style, text)
            if history is not None:
                return history
            ssml = '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="zh-CN">' \
                   '<voice name="{}">' \
                   '<mstts:express-as style="{}" styledegree="{}">' \
                   '{}' \
                   '</mstts:express-as>' \
                   '</voice>' \
                   '</speak>'.format(voice_name, style, 1.8, "<break time='0.2s'/>" + text)
            result = self.__synthesizer.speak_text_async(text).get()
            # result = self.__synthesizer.speak_ssml(ssml)  # speak_text_async feels much faster than speak_ssml
            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                # Only write the wav file when synthesis actually succeeded.
                audio_data_stream = speechsdk.AudioDataStream(result)
                file_url = './samples/sample-' + str(int(time.time() * 1000)) + '.wav'
                audio_data_stream.save_to_wav_file(file_url)
                wav_url = file_url
                self.__history_data.append((voice_name, style, text, wav_url))
                return wav_url
            else:
                util.log(1, "[x] Speech synthesis failed!")
                util.log(1, "[x] Reason: " + str(result.reason))
                return None
        else:
            voice_type = tts_voice.get_voice_of(config_util.config["attribute"]["voice"])
            voice_name = EnumVoice.XIAO_XIAO.value["voiceName"]
            if voice_type is not None:
                voice_name = voice_type.value["voiceName"]
            history = self.__get_history(voice_name, style, text)
            if history is not None:
                return history
            # SSML is kept for reference only; edge-tts below is called with plain text.
            ssml = '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="zh-CN">' \
                   '<voice name="{}">' \
                   '<mstts:express-as style="{}" styledegree="{}">' \
                   '{}' \
                   '</mstts:express-as>' \
                   '</voice>' \
                   '</speak>'.format(voice_name, style, 1.8, text)
            try:
                file_url = './samples/sample-' + str(int(time.time() * 1000)) + '.mp3'
                # Run the async edge-tts request to completion, then convert the mp3 to wav.
                asyncio.run(self.get_edge_tts(text, voice_name, file_url))
                wav_url = self.convert_mp3_to_wav(file_url)
                self.__history_data.append((voice_name, style, text, wav_url))
            except Exception as e:
                util.log(1, "[x] Speech synthesis failed!")
                util.log(1, "[x] Reason: " + str(e))
                wav_url = None
            return wav_url


if __name__ == '__main__':
    cfg.load_config()
    sp = Speech()
    sp.connect()
    text = "我叫Fay,我今年18岁,很年青。"
    s = sp.to_sample(text, "cheerful")
    print(s)
    sp.close()
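As a usage note, the wav path returned by to_sample can be handed to any player. A minimal sketch using pygame (imported above but unused in this file) that blocks until playback has finished; play_and_wait is a hypothetical helper, not part of the repo:

import pygame

def play_and_wait(wav_filepath):
    # Play the generated sample and poll until the mixer reports playback is done.
    pygame.mixer.init()
    pygame.mixer.music.load(wav_filepath)
    pygame.mixer.music.play()
    while pygame.mixer.music.get_busy():
        pygame.time.wait(100)

For example, play_and_wait(s) could follow sp.to_sample(text, "cheerful") in the __main__ block above.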