4cfad5ae0f
- 全新ui - 全面优化websocket逻辑,提高数字人和ui连接的稳定性及资源开销 - 全面优化唤醒逻辑,提供稳定的普通唤醒模式和前置词唤醒模式 - 优化拾音质量,支持多声道麦克风拾音 - 优化自动播放服务器的对接机制,提供稳定和兼容旧版ue工程的对接模式 - 数字人接口输出机器人表情,以适应新fay ui及单片机的数字人表情输出 - 使用更高级的音频时长计算方式,可以更精准控制音频播放完成后的逻辑 - 修复点击关闭按钮会导致程序退出的bug - 修复没有麦克风的设备开启麦克风会出错的问题 - 为服务器主机地址提供配置项,以方便服务器部署
179 lines
5.6 KiB
Python
179 lines
5.6 KiB
Python
'''
|
|
Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
|
|
Reserved. MIT License (https://opensource.org/licenses/MIT)
|
|
|
|
2022-2023 by zhaomingwork@qq.com
|
|
'''
|
|
# pip install websocket-client
|
|
import ssl
|
|
from websocket import ABNF
|
|
from websocket import create_connection
|
|
from queue import Queue
|
|
import threading
|
|
import traceback
|
|
import json
|
|
import time
|
|
import numpy as np
|
|
|
|
import pyaudio
|
|
import asyncio
|
|
import argparse
|
|
|
|
# class for recognizer in websocket
|
|
class Funasr_websocket_recognizer():
    '''
    Streaming ASR client that talks to a FunASR server over a websocket.

    The constructor opens the connection, starts a receiver thread that
    pushes every decoded server message onto ``msg_queue`` and sends the JSON
    configuration message. Audio is then streamed with ``feed_chunk`` and the
    session is finished with ``close``.
    '''

    # Command-line options, parsed once when the class body is executed.
    # parse_known_args() is used instead of parse_args() so that importing
    # this module from a program that has its own command-line arguments does
    # not abort the process with "unrecognized arguments".
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="127.0.0.1", required=False, help="host ip, localhost, 0.0.0.0")
    parser.add_argument("--port", type=int, default=10194, required=False, help="grpc server port")
    parser.add_argument("--chunk_size", type=int, default=160, help="ms")
    # type=bool would turn every non-empty string (including "False") into
    # True, so the flag text is interpreted explicitly.
    parser.add_argument("--vad_needed",
                        type=lambda value: value.strip().lower() not in ("", "0", "false", "no", "off"),
                        default=True)
    args = parser.parse_known_args()[0]

    def __init__(self, host="127.0.0.1",
                 port="10197",
                 is_ssl=True,
                 chunk_size="0, 10, 5",
                 chunk_interval=10,
                 mode="2pass",
                 wav_name="default"):
        '''
        Connect to the server and send the session configuration.

        host: server host ip
        port: server port
        is_ssl: True for wss protocol, False for ws
        chunk_size: comma separated integers (or a sequence of integers)
            forwarded to the server as ``chunk_size``
        chunk_interval: forwarded to the server as ``chunk_interval``
        mode: forwarded to the server as ``mode``
        wav_name: forwarded to the server as ``wav_name``

        Connection errors are printed (with traceback) and not re-raised; in
        that case ``self.websocket`` stays ``None``.
        '''
        self.host = host
        self.port = port
        self.msg_queue = Queue()  # used for recognized result text
        # Defined up front so the other methods can detect a failed connect.
        self.websocket = None
        self.thread_msg = None

        try:
            if is_ssl:
                # NOTE: certificate and hostname verification are disabled
                # here, exactly as in the original client.
                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                uri = "wss://{}:{}".format(host, port)
                ssl_opt = {"cert_reqs": ssl.CERT_NONE}
            else:
                uri = "ws://{}:{}".format(host, port)
                ssl_context = None
                ssl_opt = None

            print("connect to url", uri)
            self.websocket = create_connection(uri, ssl=ssl_context, sslopt=ssl_opt)

            self.thread_msg = threading.Thread(target=self.thread_rec_msg)
            self.thread_msg.start()

            # Accept both the "0, 10, 5" string form and a ready-made sequence.
            if isinstance(chunk_size, str):
                chunk_size = chunk_size.split(",")
            chunk_size = [int(x) for x in chunk_size]

            message = json.dumps({"mode": mode,
                                  "chunk_size": chunk_size,
                                  "encoder_chunk_look_back": 4,
                                  "decoder_chunk_look_back": 1,
                                  "chunk_interval": chunk_interval,
                                  "wav_name": wav_name,
                                  "is_speaking": True})

            self.websocket.send(message)
            print("send json", message)

        except Exception as e:
            print("Exception:", e)
            traceback.print_exc()

    # Disabled microphone capture coroutine, kept for reference.
    # async def record():
    #     global voices
    #     FORMAT = pyaudio.paInt16
    #     CHANNELS = 1
    #     RATE = 16000
    #     CHUNK = int(RATE / 1000 * args.chunk_size)
    #
    #     p = pyaudio.PyAudio()
    #
    #     stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
    #
    #     while True:
    #         data = stream.read(CHUNK)
    #         voices.put(data)
    #         await asyncio.sleep(0.01)

    # thread for receiving messages
    def thread_rec_msg(self):
        '''
        Receive loop: decode every non-empty frame as JSON and queue it.

        Runs until ``recv`` raises (for example because the socket was
        closed). A frame that is not valid JSON is reported and skipped
        instead of terminating the loop.
        '''
        try:
            while True:
                msg = self.websocket.recv()
                if not msg:
                    continue
                try:
                    msg = json.loads(msg)
                except ValueError as err:
                    # json.JSONDecodeError and UnicodeDecodeError are both
                    # ValueError subclasses.
                    print("invalid json message:", err)
                    continue
                self.msg_queue.put(msg)
        except Exception:
            print("client closed")

    # feed data to asr engine, wait_time means waiting for result until time out
    def feed_chunk(self, chunk, wait_time=0.01):
        '''
        Send one binary audio chunk and return the newest queued result.

        chunk: raw bytes sent as a binary websocket frame
        wait_time: seconds to wait for the first queued message

        Returns the most recent message in ``msg_queue`` (older pending
        messages are discarded), or "" when nothing arrived within
        ``wait_time`` or the chunk could not be sent.
        '''
        # Local import: the file only imports Queue from the queue module.
        from queue import Empty

        try:
            self.websocket.send(chunk, ABNF.OPCODE_BINARY)
        except Exception:
            # Best effort, as before: a failed send yields an empty result.
            # Unlike a bare except, KeyboardInterrupt/SystemExit propagate.
            return ""

        try:
            msg = self.msg_queue.get(timeout=wait_time)
            # Drain whatever else is already queued; keep only the last one.
            while True:
                msg = self.msg_queue.get_nowait()
        except Empty:
            pass
        except Exception:
            return ""

        try:
            return msg
        except NameError:
            # Nothing was received before the timeout.
            return ""

    def close(self, timeout=1):
        '''
        Signal end of speech, wait for trailing results and close the socket.

        timeout: seconds to sleep after the end-of-speech message so the
            receiver thread can queue the final results

        Returns the last queued message, or "" if there is none (also when
        the connection was never established). The socket is closed even if
        sending the end-of-speech message fails.
        '''
        if self.websocket is None:
            return ""

        msg = ""
        try:
            message = json.dumps({"is_speaking": False})
            self.websocket.send(message)
            # sleep for timeout seconds to wait for result
            time.sleep(timeout)
            while not self.msg_queue.empty():
                msg = self.msg_queue.get()
        finally:
            self.websocket.close()
        # only return the last msg
        return msg
|
|
|
|
if __name__ == '__main__':
    # Demo: stream a local WAV file to the recognizer chunk by chunk and
    # print every result that comes back.
    print('example for Funasr_websocket_recognizer')
    import wave

    wav_path = "long.wav"
    # wav_path = "/Users/zhifu/Downloads/modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
    with wave.open(wav_path, "rb") as wav_file:
        params = wav_file.getparams()
        audio_bytes = bytes(wav_file.readframes(wav_file.getnframes()))

    # Bytes sent per chunk (evaluates to 1920); presumably 60 ms of
    # 16 kHz 16-bit mono audio - confirm against the input file.
    bytes_per_chunk = int(60 * 10 / 10 / 1000 * 16000 * 2)

    # create a recognizer with the default connection settings
    recognizer = Funasr_websocket_recognizer()

    # walk over the audio in fixed-size slices; the last one may be shorter
    for offset in range(0, len(audio_bytes), bytes_per_chunk):
        piece = audio_bytes[offset:offset + bytes_per_chunk]
        result = recognizer.feed_chunk(piece, wait_time=0.02)
        if len(result) > 0:
            print("text", result)
        time.sleep(0.05)

    # get last message
    result = recognizer.close(timeout=3)
    print("text", result)
|
|
|
|
|
|
|