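# Text-to-speech helper: the Speech class synthesizes audio with the Azure
# Speech SDK when a Microsoft TTS key is configured, and falls back to the
# edge-tts service (mp3 converted to wav) otherwise. Results are cached in
# memory per (voice_name, style, text) and written under ./samples/.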
import time
import asyncio

import azure.cognitiveservices.speech as speechsdk
import pygame  # unused in this module
import edge_tts
from pydub import AudioSegment

from tts import tts_voice
from tts.tts_voice import EnumVoice
from utils import util, config_util
from utils import config_util as cfg
from scheduler.thread_manager import MyThread  # unused in this module


class Speech:
    def __init__(self):
        self.ms_tts = False
        voice_type = tts_voice.get_voice_of(config_util.config["attribute"]["voice"])
        voice_name = EnumVoice.XIAO_XIAO.value["voiceName"]
        if voice_type is not None:
            voice_name = voice_type.value["voiceName"]
        # Use the Azure Speech SDK only when a Microsoft TTS key is configured.
        if config_util.key_ms_tts_key and config_util.key_ms_tts_key.strip():
            self.__speech_config = speechsdk.SpeechConfig(subscription=cfg.key_ms_tts_key, region=cfg.key_ms_tts_region)
            self.__speech_config.speech_recognition_language = "zh-CN"
            self.__speech_config.speech_synthesis_voice_name = voice_name
            self.__speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
            self.__synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.__speech_config, audio_config=None)
            self.ms_tts = True
        self.__connection = None
        self.__history_data = []

    def __get_history(self, voice_name, style, text):
        # Return the cached audio file path for this (voice, style, text), or None.
        for data in self.__history_data:
            if data[0] == voice_name and data[1] == style and data[2] == text:
                return data[3]
        return None

    def connect(self):
        # Pre-open the Azure connection so the first synthesis request is not
        # delayed by connection setup; edge-tts needs no persistent connection.
        if self.ms_tts:
            self.__connection = speechsdk.Connection.from_speech_synthesizer(self.__synthesizer)
            self.__connection.open(True)
        util.log(1, "TTS service connected!")

    def close(self):
        if self.__connection is not None:
            self.__connection.close()

    # Generate an mp3 audio file with edge-tts.
    async def get_edge_tts(self, text, voice, file_url) -> None:
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(file_url)

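    # Note: get_edge_tts is a coroutine; to_sample drives it to completion with
    # asyncio.run(), so callers of this class can stay synchronous.
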
    def convert_mp3_to_wav(self, mp3_filepath):
        audio = AudioSegment.from_mp3(mp3_filepath)
        # Resample to 44.1 kHz with set_frame_rate.
        audio = audio.set_frame_rate(44100)
        wav_filepath = mp3_filepath.rsplit(".", 1)[0] + ".wav"
        audio.export(wav_filepath, format="wav")
        return wav_filepath

"""
|
|||
|
文字转语音
|
|||
|
:param text: 文本信息
|
|||
|
:param style: 说话风格、语气
|
|||
|
:returns: 音频文件路径
|
|||
|
"""
|
|||
|
|
|||
|
def to_sample(self, text, style):
|
|||
|
if self.ms_tts:
|
|||
|
voice_type = tts_voice.get_voice_of(config_util.config["attribute"]["voice"])
|
|||
|
voice_name = EnumVoice.XIAO_XIAO.value["voiceName"]
|
|||
|
if voice_type is not None:
|
|||
|
voice_name = voice_type.value["voiceName"]
|
|||
|
history = self.__get_history(voice_name, style, text)
|
|||
|
if history is not None:
|
|||
|
return history
|
|||
|
ssml = '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="zh-CN">' \
|
|||
|
'<voice name="{}">' \
|
|||
|
'<mstts:express-as style="{}" styledegree="{}">' \
|
|||
|
'{}' \
|
|||
|
'</mstts:express-as>' \
|
|||
|
'</voice>' \
|
|||
|
'</speak>'.format(voice_name, style, 1.8, "<break time='0.2s'/>" + text)
|
|||
|
result = self.__synthesizer.speak_text_async(text).get()
|
|||
|
# result = self.__synthesizer.speak_ssml(ssml)#感觉使用sepak_text_async要快很多
|
|||
|
audio_data_stream = speechsdk.AudioDataStream(result)
|
|||
|
file_url = './samples/sample-' + str(int(time.time() * 1000)) + '.wav'
|
|||
|
audio_data_stream.save_to_wav_file(file_url)
|
|||
|
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
|
|||
|
wav_url = file_url
|
|||
|
self.__history_data.append((voice_name, style, text, wav_url))
|
|||
|
return wav_url
|
|||
|
else:
|
|||
|
util.log(1, "[x] 语音转换失败!")
|
|||
|
util.log(1, "[x] 原因: " + str(result.reason))
|
|||
|
return None
|
|||
|
else:
|
|||
|
voice_type = tts_voice.get_voice_of(config_util.config["attribute"]["voice"])
|
|||
|
voice_name = EnumVoice.XIAO_XIAO.value["voiceName"]
|
|||
|
if voice_type is not None:
|
|||
|
voice_name = voice_type.value["voiceName"]
|
|||
|
history = self.__get_history(voice_name, style, text)
|
|||
|
if history is not None:
|
|||
|
return history
|
|||
|
ssml = '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="zh-CN">' \
|
|||
|
'<voice name="{}">' \
|
|||
|
'<mstts:express-as style="{}" styledegree="{}">' \
|
|||
|
'{}' \
|
|||
|
'</mstts:express-as>' \
|
|||
|
'</voice>' \
|
|||
|
'</speak>'.format(voice_name, style, 1.8, text)
|
|||
|
try:
|
|||
|
file_url = './samples/sample-' + str(int(time.time() * 1000)) + '.mp3'
|
|||
|
asyncio.new_event_loop().run_until_complete(self.get_edge_tts(text,voice_name,file_url))
|
|||
|
wav_url = self.convert_mp3_to_wav(file_url)
|
|||
|
self.__history_data.append((voice_name, style, text, wav_url))
|
|||
|
except Exception as e :
|
|||
|
util.log(1, "[x] 语音转换失败!")
|
|||
|
util.log(1, "[x] 原因: " + str(str(e)))
|
|||
|
wav_url = None
|
|||
|
return wav_url
|
|||
|
|
|||
|
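# Note: both synthesis paths save their output under ./samples/. The directory
# is assumed to exist; neither the Azure nor the edge-tts path creates it.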
if __name__ == '__main__':
    cfg.load_config()
    sp = Speech()
    sp.connect()
    text = "我叫Fay,我今年18岁,很年青。"  # Chinese demo sentence for the zh-CN voice
    s = sp.to_sample(text, "cheerful")
    print(s)
    sp.close()