20230315

1、增加edge-tts语音合成（免费）可替换azure-tts(支持情绪化语音)； 2、替换flask发行版运行方式。
2023-03-15 02:22:50 +08:00 · 2023-03-15 02:22:50 +08:00 · 2dca3ad6db
commit 2dca3ad6db
parent 068f4d7d17
9 changed files with 113 additions and 71 deletions
--- a/README.md
+++ b/README.md
@@ -6,11 +6,7 @@
 </div>


-		本开源项目名为“数字人控制器”。意为，本项目可以充当时下流行的虚拟人、虚拟主播、数字人，等仿人形数字形象的内核部分。
-
-		使用UE、C4D、DAZ、LIVE2D等三维引擎软件开发的数字形象可以与本“数字人控制器”对接，从而实现虚拟主播、数字导游、数字助手等。我们提供UE4对接的demo，但我们更鼓励用户自行实现喜欢的数字形象。
-
-		当然，若不考虑外观形象的话，本“数字人控制器”其实也可以独立使用的，可以充当一个语音助理。NLP可以自由切换AIUI、ChatGPT及Yuan1.0。
+		Fay是一个完整的开源项目，包含Python内核及UE数字人模型。开发人员可以利用该项目简单地构建各种类型的数字人或数字助理。该项目各模块之间耦合度非常低，包括声音来源、语音识别、情绪分析、NLP处理、情绪语音合成、语音输出和表情动作输出等模块。每个模块都可以轻松地更换。其中，NLP可以自由切换AIUI、GPT及Yuan1.0。

 <img src="images/5.png" alt="QA">

@@ -30,6 +26,10 @@
 目前最新版本是2.0。在新版本里我们提出一个全新的架构。在这个架构下每个人都可以把Fay控制器搭建在自己个人电脑上（未来，或许我们会提供终端），让你电脑成为你数字助理的载体。你的所有设备（手表、手机、眼镜、笔记本）随时可以与你的数字助理通讯，数字助理将通过电脑为你处理数字世界里的所有事情。（贾维斯？Her?）
 ![](images/20230122074644.png)

+2023.03：
+1、增加edge-tts语音合成（免费）可替换azure-tts(支持情绪化语音)；
+2、替换flask发行版运行方式。
+
 2023.02：
 1、提供chatgpt及yuan1.0作为选择。

@@ -201,7 +201,7 @@ python main.py
 | 代码模块                  | 描述                       | 链接                                                         |
 | ------------------------- | -------------------------- | ------------------------------------------------------------ |
 | ./ai_module/ali_nls.py    | 阿里云 实时语音识别        | https://ai.aliyun.com/nls/trans                              |
-| ./ai_module/ms_tts_sdk.py | 微软 文本转语音 基于SDK    | https://azure.microsoft.com/zh-cn/services/cognitive-services/text-to-speech/ |
+| ./ai_module/ms_tts_sdk.py | 微软 文本转情绪语音（可选）   | https://azure.microsoft.com/zh-cn/services/cognitive-services/text-to-speech/ |
 | ./ai_module/xf_ltp.py     | 讯飞 情感分析              | https://www.xfyun.cn/service/emotion-analysis                |
 | ./utils/ngrok_util.py     | ngrok.cc 外网穿透（可选）  | http://ngrok.cc                                              |
 | ./ai_module/yuan_1_0.py    | 浪潮源大模型（NLP 3选1）  | https://air.inspur.com/                                              |
@@ -279,6 +279,6 @@ python main.py

 技术交流群

-<img src="images/20230217205435.jpg" alt="微信群">
+<img src="images/20230315021425.jpg" alt="微信群">


--- a/ai_module/ms_tts_sdk.py
+++ b/ai_module/ms_tts_sdk.py
@@ -1,25 +1,33 @@
 import time

 import azure.cognitiveservices.speech as speechsdk
-
+import asyncio
+import sys
+sys.path.append("E:\\GitHub\\Fay\\")
 from core import tts_voice
 from core.tts_voice import EnumVoice
 from utils import util, config_util
 from utils import config_util as cfg
 import pygame
+import edge_tts
+



 class Speech:
    def __init__(self):
-        self.__speech_config = speechsdk.SpeechConfig(subscription=cfg.key_ms_tts_key, region=cfg.key_ms_tts_region)
-        self.__speech_config.speech_recognition_language = "zh-CN"
-        self.__speech_config.speech_synthesis_voice_name = "zh-CN-XiaoxiaoNeural"
-        self.__speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
-        self.__synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.__speech_config, audio_config=None)
+        self.ms_tts = False
+        if config_util.key_ms_tts_key and config_util.key_ms_tts_key is not None and config_util.key_ms_tts_key.strip() != "":
+            self.__speech_config = speechsdk.SpeechConfig(subscription=cfg.key_ms_tts_key, region=cfg.key_ms_tts_region)
+            self.__speech_config.speech_recognition_language = "zh-CN"
+            self.__speech_config.speech_synthesis_voice_name = "zh-CN-XiaoxiaoNeural"
+            self.__speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)
+            self.__synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.__speech_config, audio_config=None)
+            self.ms_tts = True
        self.__connection = None
        self.__history_data = []

+
    def __get_history(self, voice_name, style, text):
        for data in self.__history_data:
            if data[0] == voice_name and data[1] == style and data[2] == text:
@@ -27,14 +35,20 @@ class Speech:
        return None

    def connect(self):
-        self.__connection = speechsdk.Connection.from_speech_synthesizer(self.__synthesizer)
-        self.__connection.open(True)
+        if self.ms_tts:
+            self.__connection = speechsdk.Connection.from_speech_synthesizer(self.__synthesizer)
+            self.__connection.open(True)
        util.log(1, "TTS 服务已经连接！")

    def close(self):
        if self.__connection is not None:
            self.__connection.close()

+    #生成mp3音频
+    async def get_edge_tts(self,text,voice,file_url) -> None:
+        communicate = edge_tts.Communicate(text, voice)
+        await communicate.save(file_url)
+
    """
    文字转语音
    :param text: 文本信息
@@ -43,49 +57,66 @@ class Speech:
    """

    def to_sample(self, text, style):
-        voice_type = tts_voice.get_voice_of(config_util.config["attribute"]["voice"])
-        voice_name = EnumVoice.XIAO_XIAO.value["voiceName"]
-        if voice_type is not None:
-            voice_name = voice_type.value["voiceName"]
-        history = self.__get_history(voice_name, style, text)
-        if history is not None:
-            return history
-        ssml = '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="zh-CN">' \
-               '<voice name="{}">' \
-               '<mstts:express-as style="{}" styledegree="{}">' \
-               '{}' \
-               '</mstts:express-as>' \
-               '</voice>' \
-               '</speak>'.format(voice_name, style, 1.8, text)
-        result = self.__synthesizer.speak_ssml(ssml)
-        audio_data_stream = speechsdk.AudioDataStream(result)
+        if self.ms_tts:
+            voice_type = tts_voice.get_voice_of(config_util.config["attribute"]["voice"])
+            voice_name = EnumVoice.XIAO_XIAO.value["voiceName"]
+            if voice_type is not None:
+                voice_name = voice_type.value["voiceName"]
+            history = self.__get_history(voice_name, style, text)
+            if history is not None:
+                return history
+            ssml = '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="zh-CN">' \
+                   '<voice name="{}">' \
+                   '<mstts:express-as style="{}" styledegree="{}">' \
+                   '{}' \
+                   '</mstts:express-as>' \
+                   '</voice>' \
+                   '</speak>'.format(voice_name, style, 1.8, text)
+            result = self.__synthesizer.speak_ssml(ssml)
+            audio_data_stream = speechsdk.AudioDataStream(result)

-        file_url = './samples/sample-' + str(int(time.time() * 1000)) + '.wav'
-        audio_data_stream.save_to_wav_file(file_url)
-        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
-            self.__history_data.append((voice_name, style, text, file_url))
-            return file_url
+            file_url = './samples/sample-' + str(int(time.time() * 1000)) + '.mp3'
+            audio_data_stream.save_to_wav_file(file_url)
+            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+                self.__history_data.append((voice_name, style, text, file_url))
+                return file_url
+            else:
+                util.log(1, "[x] 语音转换失败！")
+                util.log(1, "[x] 原因: " + str(result.reason))
+                return None
        else:
-            util.log(1, "[x] 语音转换失败！")
-            util.log(1, "[x] 原因: " + str(result.reason))
-            return None
+            voice_type = tts_voice.get_voice_of(config_util.config["attribute"]["voice"])
+            voice_name = EnumVoice.XIAO_XIAO.value["voiceName"]
+            if voice_type is not None:
+                voice_name = voice_type.value["voiceName"]
+            history = self.__get_history(voice_name, style, text)
+            if history is not None:
+                return history
+            ssml = '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="zh-CN">' \
+                   '<voice name="{}">' \
+                   '<mstts:express-as style="{}" styledegree="{}">' \
+                   '{}' \
+                   '</mstts:express-as>' \
+                   '</voice>' \
+                   '</speak>'.format(voice_name, style, 1.8, text)
+            try:
+                file_url = './samples/sample-' + str(int(time.time() * 1000)) + '.mp3'
+                asyncio.new_event_loop().run_until_complete(self.get_edge_tts(text,voice_name,file_url))
+                self.__history_data.append((voice_name, style, text, file_url))
+            except Exception as e :
+                util.log(1, "[x] 语音转换失败！")
+                util.log(1, "[x] 原因: " + str(str(e)))
+                file_url = None
+            return file_url
+
+
 if __name__ == '__main__':
    cfg.load_config()
    sp = Speech()
    sp.connect()
-    pygame.init()
-    text = """一座城市，总有一条标志性道路，它见证着这座城市的时代变迁，并随着城市历史积淀砥砺前行，承载起城市的非凡荣耀。季华路，见证了佛山的崛起，从而也被誉为“最代表佛山城市发展的一条路”。季华路位于佛山市禅城区，是佛山市总体道路规划网中东西走向的城市主干道，全长20公里，是佛山市公路网络规划"四纵、九横、两环"主骨架中的重要组成部分，西接禅城南庄、高明、三水，东连南海、广州，横跨佛山一环、禅西大道、佛山大道、岭南大道、南海大道五大主干道，贯穿中心城区四个镇街，沿途经过多处文化古迹和重要产业区，是名副其实的“交通动脉”。同时季华路也是佛山的经济“大动脉”，代表着佛山蓬勃发展的现在，也影响着佛山日新月异的未来。
-        季华六路起于南海大道到文华北截至，道路为东西走向，全长1.5公里，该路段为1996年完成建设并投入使用，该道路为一级公路，路面使用混凝土材质，道路为双向5车道，路宽30米，途径1个行政单位，一条隧道，该路段设有格栅518个，两边护栏1188米，沙井盖158个，其中供水26个，市政77个，移动通讯2个，联通通讯3个，电信通讯3个，交通信号灯1个，人行天桥2个，电梯4台，标志牌18个，标线为1.64万米。
-        道路南行是文华中路，可通往亚洲艺术公园，亚洲艺术公园位于佛山市发展区的中心，占地40公顷，其中水体面积26.6公顷，以岭南水乡为文脉，以水上森林为绿脉，以龙舟竞渡为水脉，通过建筑、雕塑、植物、桥梁等设计要素，营造出一个具有亚洲艺术风采的艺术园地。曾获选佛山十大最美公园之一。
-        道路北行是文华北路，可通往佛山市委市政府。佛山市委市政府是广东省佛山市的行政管理机关。
-        道路西行到达文华公园。佛山市文华公园位于佛山市禅城区季华路以南（电视塔旁）、文华路以西，大福路以东路段，建设面积约11万平方米，主要将传统文化和现代园林有机结合，全园布局以大树木、大草坪、多彩植被和人工湖为表现主体，精致的溪涧、小桥、亲水平台点缀其间，通过棕榈植物错落有序的巧妙搭配，令园区既蕴涵亚热带曼妙风情，又不失岭南园艺的独特风采。通过“借景”、“透景”造园手法，与邻近的电视塔相映成趣，它的落成，为附近市民的休闲生活添上了色彩绚丽的一笔。
-
-        季华五路是季华路最先建设的一段道路，起于岭南大道到佛山大道截至，道路为东西走向，全长2.1公里，该路段为1993年完成建设并投入使用，该道路为一级公路，路面使用混凝土材质，道路为双向5车道，路宽30米，途径1个行政单位，该路段设有格栅634个，两边护栏1310米，沙井盖180个，其中供水30个，市政81个，移动通讯5个，联通通讯3个，交通信号灯2个，人行天桥3个，电梯12台，标志牌26个，标线为2.131万米。
-        沿途经过季华园，季华园即佛山季华公园，位于佛山市城南新区，1994年5月建成。占地200多亩。场内所有设施免费使用。景点介绍风格清新、意境优雅季华公园是具有亚热带风光的大型开放游览性公园。由于场内所有设施免费使用，地方广阔，每天都吸引着众多的游人前来休闲、运动等。
-        道路南行是佛山大道中，可通往乐从方向乐从镇，地处珠三角腹地，广佛经济圈核心带，是国家级重大国际产业、城市发展合作平台--中德工业服务区、中欧城镇化合作示范区的核心。
-        道路北行佛山大道中，可通往佛山火车站，佛山火车站是广东省的铁路枢纽之一，广三铁路经过该站。"""
+    text = """这是一段音频，测试一下3"""
    s = sp.to_sample(text, "cheerful")
+
    print(s)
-    pygame.mixer.music.load(s)
-    pygame.mixer.music.play()
-    sp.close()
+    sp.close()
+
--- a/core/fay_core.py
+++ b/core/fay_core.py
@@ -492,9 +492,9 @@ class FeiFei:

    def __send_audio(self, file_url, say_type):
        try:
-            # audio_length = eyed3.load(file_url).info.time_secs mp3音频长度
-            with wave.open(file_url, 'rb') as wav_file:
-                audio_length = wav_file.getnframes() / float(wav_file.getframerate())
+            audio_length = eyed3.load(file_url).info.time_secs #mp3音频长度
+            # with wave.open(file_url, 'rb') as wav_file: #wav音频长度
+            #     audio_length = wav_file.getnframes() / float(wav_file.getframerate())
            if audio_length <= config_util.config["interact"]["maxInteractTime"] or say_type == "script":
                if config_util.config["interact"]["playSound"]: # 播放音频
                    self.__play_sound(file_url)
--- a/core/recorder.py
+++ b/core/recorder.py
@@ -3,11 +3,6 @@ import math
 import time
 from abc import abstractmethod

-
-import pyaudio
-import wave
-
-
 from ai_module.ali_nls import ALiNls
 from core import wsa_server
 from scheduler.thread_manager import MyThread
@@ -89,9 +84,7 @@ class Recorder:

   
    def __record(self):
-        self.total = 0
-    
-        stream = self.get_stream()
+        stream = self.get_stream() #把get stream的方式封装出来方便实现麦克风录制及网络流等不同的流录制子类

        isSpeaking = False
        last_mute_time = time.time()
@@ -100,8 +93,6 @@ class Recorder:
            data = stream.read(1024, exception_on_overflow=False)
            if not data:
                continue
-            else:
-                self.total += len(data)

            level = audioop.rms(data, 2)
            if len(self.__history_data) >= 5:
--- a/gui/flask_server.py
+++ b/gui/flask_server.py
@@ -6,10 +6,12 @@ from flask import Flask, render_template, request
 from flask_cors import CORS

 import fay_booter
-from core import wsa_server
+
 from core.tts_voice import EnumVoice
+from gevent import pywsgi
 from scheduler.thread_manager import MyThread
 from utils import config_util
+from core import wsa_server

 __app = Flask(__name__)
 CORS(__app, supports_credentials=True)
@@ -78,6 +80,9 @@ def home_get():
 def home_post():
    return __get_template()

+def run():
+    server = pywsgi.WSGIServer(('0.0.0.0',5000), __app)
+    server.serve_forever()

 def start():
-    MyThread(target=__app.run).start()
+    MyThread(target=run).start()
--- a/images/20230315021425.jpg
+++ b/images/20230315021425.jpg
--- a/main.py
+++ b/main.py
@@ -17,7 +17,7 @@ def __clear_samples():
    if not os.path.exists("./samples"):
        os.mkdir("./samples")
    for file_name in os.listdir('./samples'):
-        if file_name.startswith('sample-') and file_name.endswith('.wav'):
+        if file_name.startswith('sample-') and file_name.endswith('.mp3'):
            os.remove('./samples/' + file_name)


@@ -25,7 +25,7 @@ def __clear_songs():
    if not os.path.exists("./songs"):
        os.mkdir("./songs")
    for file_name in os.listdir('./songs'):
-        if file_name.endswith('.wav'):
+        if file_name.endswith('.mp3'):
            os.remove('./songs/' + file_name)


--- a/requirements.txt
+++ b/requirements.txt
@@ -18,4 +18,6 @@ aliyun-python-sdk-core==2.13.3
 scipy~=1.10.0
 openai~=0.26.5
 simhash
-pytz
+pytz
+gevent~=22.10.1
+edge_tts~=6.1.3
--- a/test/test_flask.py
+++ b/test/test_flask.py
@@ -0,0 +1,13 @@
+from flask import Flask
+from flask_cors import CORS
+from gevent import pywsgi
+
+app = Flask("test_server")
+CORS(app, supports_credentials=True)
+
+@app.route('/', methods=['get'])
+def abc():
+    return 'hello world'
+
+server = pywsgi.WSGIServer(('0.0.0.0',5000), app)
+server.serve_forever()