olivebot/core/recorder.py

#作用是音频录制，对于aliyun asr来说，边录制边stt，但对于其他来说，是先保存成文件再推送给asr模型，通过实现子类的方式（fay_booter.py 上有实现）来管理音频流的来源
import audioop
import math
import time
import threading
from abc import abstractmethod

from asr.ali_nls import ALiNls
from asr.funasr import FunASR
from core import wsa_server
from scheduler.thread_manager import MyThread
from utils import util
from utils import config_util as cfg
import numpy as np
import tempfile
import wave
from core import fay_core
from core import interact

import re

# 启动时间 (秒)
_ATTACK = 0.05 # 更快进入拾音，减少短唤醒词被漏掉的概率

# 释放时间 (秒)
_RELEASE = 0.7


class Recorder:

    def __init__(self, fay):
        self.__fay = fay
        self.__running = True
        self.__processing = False
        self.__history_level = []
        self.__history_data = []
        self.__dynamic_threshold = 0.5 # 声音识别的音量阈值

        self.__MAX_LEVEL = 25000
        self.__MAX_BLOCK = 100

        #Edit by xszyou in 20230516:增加本地asr
        self.ASRMode = cfg.ASR_mode
        self.__aLiNls = None
        self.is_awake = False
        self.wakeup_matched = False
        if cfg.config['source']['wake_word_enabled']:
            self.timer = threading.Timer(60, self.reset_wakeup_status)  # 60秒后执行reset_wakeup_status方法
        self.username = 'User' #默认用户，子类实现时会重写
        self.channels = 1
        self.sample_rate = 16000
        self.is_reading = False
        self.stream = None

    def asrclient(self):
        if self.ASRMode == "ali":
            asrcli = ALiNls(self.username)
        elif self.ASRMode == "funasr" or self.ASRMode == "sensevoice":
            asrcli = FunASR(self.username)
        return asrcli

    def save_buffer_to_file(self, buffer):
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav", dir="cache_data")
        wf = wave.open(temp_file.name, 'wb')
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(16000)
        wf.writeframes(buffer)
        wf.close()
        return temp_file.name

    def __get_history_average(self, number):
        total = 0
        num = 0
        for i in range(len(self.__history_level) - 1, -1, -1):
            level = self.__history_level[i]
            total += level
            num += 1
            if num >= number:
                break
        return total / num

    def __get_history_percentage(self, number):
        return (self.__get_history_average(number) / self.__MAX_LEVEL) * 1.05 + 0.02

    def reset_wakeup_status(self):
        self.wakeup_matched = False
        with fay_core.auto_play_lock:
            fay_core.can_auto_play = True

    def _norm_asr_text(self, s: str) -> str:
        """
        大厅场景：ASR 结果常见问题
        - 句首有空格/标点/语气词（嗯、啊、这个...）
        - 唤醒词后跟逗号、冒号等
        所以需要做最小必要的规范化，避免 front 模式误判为“待唤醒”
        """
        if not s:
            return ""
        s = s.strip()
        # 去掉句首标点/空白
        s = re.sub(r'^[\s，,。.!！？:：、~～]+', '', s)
        # 去掉句首常见语气词（按需可继续加）
        s = re.sub(r'^(嗯|啊|呃|额|哦|唔|这个|那个)\s*', '', s)
        return s

    def _strip_wake_prefix(self, full_text: str, wake_word: str) -> str:
        """
        去除前置唤醒词，只把“真正的问题”交给对话系统
        例如：'小F，播放音乐' -> '播放音乐'
        """
        rest = full_text[len(wake_word):]
        # 吞掉唤醒词后的空白/标点
        return rest.lstrip(" \t，,。.!！？:：、~～")

    def __waitingResult(self, iat: asrclient, audio_data):
        self.processing = True
        t = time.time()
        tm = time.time()
        if self.ASRMode == "funasr"  or self.ASRMode == "sensevoice":
            file_url = self.save_buffer_to_file(audio_data)
            self.__aLiNls.send_url(file_url)

        # return
        # 等待结果返回
        while not iat.done and time.time() - t < 1:
            time.sleep(0.01)
        text = iat.finalResults
        util.printInfo(1, self.username, "语音处理完成！ 耗时: {} ms".format(math.floor((time.time() - tm) * 1000)))
        if len(text) > 0:
            if cfg.config['source']['wake_word_enabled']:
                #普通唤醒模式
                if cfg.config['source']['wake_word_type'] == 'common':

                    if not self.wakeup_matched:
                        #唤醒词判断
                        wake_word =  cfg.config['source']['wake_word']
                        wake_word_list = wake_word.split(',')
                        wake_up = False
                        for word in wake_word_list:
                            if word in text:
                                    wake_up = True
                        if wake_up:
                            util.printInfo(1, self.username, "唤醒成功！")
                            if wsa_server.get_web_instance().is_connected(self.username):
                                wsa_server.get_web_instance().add_cmd({"panelMsg": "唤醒成功！", "Username" : self.username , 'robot': f'http://{cfg.fay_url}:5000/robot/Listening.jpg'})
                            if wsa_server.get_instance().is_connected(self.username):
                                content = {'Topic': 'Unreal', 'Data': {'Key': 'log', 'Value': "唤醒成功！"}, 'Username' : self.username, 'robot': f'http://{cfg.fay_url}:5000/robot/Listening.jpg'}
                                wsa_server.get_instance().add_cmd(content)
                            self.wakeup_matched = True  # 唤醒成功
                            with fay_core.auto_play_lock:
                                fay_core.can_auto_play = False
                            #self.on_speaking(text)
                            intt = interact.Interact("auto_play", 2, {'user': self.username, 'text': "在呢，你说？"})
                            self.__fay.on_interact(intt)
                            self.processing = False
                            self.timer.cancel()  # 取消之前的计时器任务

                        else:
                            util.printInfo(1, self.username, "[!] 待唤醒！")
                            if wsa_server.get_web_instance().is_connected(self.username):
                                wsa_server.get_web_instance().add_cmd({"panelMsg": "[!] 待唤醒！", "Username" : self.username , 'robot': f'http://{cfg.fay_url}:5000/robot/Normal.jpg'})
                            if wsa_server.get_instance().is_connected(self.username):
                                content = {'Topic': 'Unreal', 'Data': {'Key': 'log', 'Value': "[!] 待唤醒！"}, 'Username' : self.username, 'robot': f'http://{cfg.fay_url}:5000/robot/Normal.jpg'}
                                wsa_server.get_instance().add_cmd(content)
                    else:
                        self.on_speaking(text)
                        self.processing = False
                        self.timer.cancel()  # 取消之前的计时器任务
                        self.timer = threading.Timer(60, self.reset_wakeup_status)  # 重设计时器为60秒
                        self.timer.start()

                # 前置唤醒词模式（大厅优化版）
                elif cfg.config['source']['wake_word_type'] == 'front':
                    wake_word = cfg.config['source']['wake_word']
                    wake_word_list = [w.strip() for w in wake_word.split(',') if w.strip()]

                    raw_text = text
                    text2 = self._norm_asr_text(raw_text)

                    wake_up = False
                    matched_word = None

                    # 1) 规范化后做“严格句首匹配”
                    for w in wake_word_list:
                        w2 = self._norm_asr_text(w)
                        if w2 and text2.startswith(w2):
                            wake_up = True
                            matched_word = w2
                            break

                    # 2) 容错：允许唤醒词出现在句首很短范围内（防止语气词未完全清掉）
                    #    注意：范围要小，避免大厅误唤醒
                    if not wake_up:
                        N = 4  # 建议 3~6，越大越容易误触发
                        head = text2[:N]
                        for w in wake_word_list:
                            w2 = self._norm_asr_text(w)
                            if w2 and w2 in head:
                                wake_up = True
                                matched_word = w2
                                # 从唤醒词出现的位置截断，确保 strip 正确
                                idx = text2.find(w2)
                                text2 = text2[idx:]
                                break

                    if wake_up:
                        util.printInfo(1, self.username, f"唤醒成功！(front:{matched_word})")
                        if wsa_server.get_web_instance().is_connected(self.username):
                            wsa_server.get_web_instance().add_cmd({
                                "panelMsg": "唤醒成功！",
                                "Username": self.username,
                                'robot': f'http://{cfg.fay_url}:5000/robot/Listening.jpg'
                            })
                        if wsa_server.get_instance().is_connected(self.username):
                            content = {
                                'Topic': 'Unreal',
                                'Data': {'Key': 'log', 'Value': "唤醒成功！"},
                                'Username': self.username,
                                'robot': f'http://{cfg.fay_url}:5000/robot/Listening.jpg'
                            }
                            wsa_server.get_instance().add_cmd(content)

                        # ✅ 关键：剥离唤醒词，把真正问题交给对话系统
                        question = self._strip_wake_prefix(text2, matched_word)

                        # 如果只说了唤醒词（或后面太短），给一句提示
                        if not question:
                            question = "在呢，你说？"

                        self.on_speaking(question)
                        self.processing = False
                    else:
                        # ✅ 关键：打印原始识别和规范化后文本，现场好定位为何没匹配上
                        util.printInfo(1, self.username, f"[!] 待唤醒！(front) ASR='{raw_text}' norm='{text2}'")
                        if wsa_server.get_web_instance().is_connected(self.username):
                            wsa_server.get_web_instance().add_cmd({
                                "panelMsg": "[!] 待唤醒！",
                                "Username": self.username,
                                'robot': f'http://{cfg.fay_url}:5000/robot/Normal.jpg'
                            })
                        if wsa_server.get_instance().is_connected(self.username):
                            content = {
                                'Topic': 'Unreal',
                                'Data': {'Key': 'log', 'Value': "[!] 待唤醒！"},
                                'Username': self.username,
                                'robot': f'http://{cfg.fay_url}:5000/robot/Normal.jpg'
                            }
                            wsa_server.get_instance().add_cmd(content)

            #非唤醒模式
            else:
                 self.on_speaking(text)
                 self.processing = False
        else:
            #TODO 为什么这个设为False
            # if self.wakeup_matched:
            #     self.wakeup_matched = False
            self.processing = False
            util.printInfo(1, self.username, "[!] 语音未检测到内容！")
            self.dynamic_threshold = self.__get_history_percentage(30)
            if wsa_server.get_web_instance().is_connected(self.username):
                wsa_server.get_web_instance().add_cmd({"panelMsg": "", 'Username' : self.username, 'robot': f'http://{cfg.fay_url}:5000/robot/Normal.jpg'})
            if wsa_server.get_instance().is_connected(self.username):
                content = {'Topic': 'Unreal', 'Data': {'Key': 'log', 'Value': ""}, 'Username' : self.username, 'robot': f'http://{cfg.fay_url}:5000/robot/Normal.jpg'}
                wsa_server.get_instance().add_cmd(content)

    def __record(self):
        try:
            stream = self.get_stream() #通过此方法的阻塞来让程序往下执行
        except Exception as e:
                print(e)
                util.printInfo(1, self.username, "请检查设备是否有误，再重新启动!")
                return
        isSpeaking = False
        last_mute_time = time.time()
        last_speaking_time = time.time()
        data = None
        concatenated_audio = bytearray()
        audio_data_list = []
        while self.__running:
            try:
                cfg.load_config()
                record = cfg.config['source']['record']
                if not record['enabled'] and not self.is_remote:
                    time.sleep(1)
                    continue
                self.is_reading = True
                data = stream.read(1024, exception_on_overflow=False)
                self.is_reading = False
            except Exception as e:
                data = None
                print(e)
                util.log(1, "请检查录音设备是否有误，再重新启动!")
                self.__running = False
            if not data:
                continue
            #是否可以拾音,不可以就掉弃录音
            can_listen = True
            #没有开唤醒，但面板或数字人正在播音时不能拾音
            if cfg.config['source']['wake_word_enabled'] == False and self.__fay.speaking == True:
                can_listen = False

            #普通唤醒模式已经激活，并且面板或数字人正在输出声音时不能拾音
            if cfg.config['source']['wake_word_enabled'] == True and cfg.config['source']['wake_word_type'] == 'common' and self.wakeup_matched == True and self.__fay.speaking == True:
                can_listen = False

            if can_listen == False:#掉弃录音
                data = None
                continue

            #计算音量是否满足激活拾音
            level = audioop.rms(data, 2)

            # 把激活前缓存拉长，避免“唤醒词”在触发拾音前被漏掉
            # 1024帧@16kHz≈64ms/块，30块≈1.9秒
            if len(self.__history_data) >= 30:  # 保存激活前的音频，以免信息掉失
                self.__history_data.pop(0)
            if len(self.__history_level) >= 500:
                self.__history_level.pop(0)
            self.__history_data.append(data)
            self.__history_level.append(level)
            percentage = level / self.__MAX_LEVEL
            history_percentage = self.__get_history_percentage(30)
            if history_percentage > self.__dynamic_threshold:
                self.__dynamic_threshold += (history_percentage - self.__dynamic_threshold) * 0.0025
            elif history_percentage < self.__dynamic_threshold:
                self.__dynamic_threshold += (history_percentage - self.__dynamic_threshold) * 1


            #激活拾音
            if percentage > self.__dynamic_threshold:
                last_speaking_time = time.time()

                if not self.__processing and not isSpeaking and time.time() - last_mute_time > _ATTACK:
                    isSpeaking = True  #用户正在说话
                    util.printInfo(1, self.username,"聆听中...")
                    if wsa_server.get_web_instance().is_connected(self.username):
                        wsa_server.get_web_instance().add_cmd({"panelMsg": "聆听中...", 'Username' : self.username, 'robot': f'http://{cfg.fay_url}:5000/robot/Listening.jpg'})
                    if wsa_server.get_instance().is_connected(self.username):
                        content = {'Topic': 'Unreal', 'Data': {'Key': 'log', 'Value': "聆听中..."}, 'Username' : self.username, 'robot': f'http://{cfg.fay_url}:5000/robot/Listening.jpg'}
                        wsa_server.get_instance().add_cmd(content)
                    concatenated_audio.clear()
                    self.__aLiNls = self.asrclient()
                    try:
                        task_id = self.__aLiNls.start()
                        while not self.__aLiNls.started:
                            time.sleep(0.01)
                    except Exception as e:
                        print(e)
                        util.printInfo(1, self.username, "aliyun asr 连接受限")
                    for i in range(len(self.__history_data) - 1): #当前data在下面会做发送，这里是发送激活前的音频数据，以免漏掉信息
                        buf = self.__history_data[i]
                        audio_data_list.append(self.__process_audio_data(buf, self.channels))
                        if self.ASRMode == "ali":
                            self.__aLiNls.send(self.__process_audio_data(buf, self.channels).tobytes())
                        else:
                            concatenated_audio.extend(self.__process_audio_data(buf, self.channels).tobytes())
                    self.__history_data.clear()
            else:#结束拾音
                last_mute_time = time.time()
                if isSpeaking:
                    if time.time() - last_speaking_time > _RELEASE: #TODO 更换的vad更靠谱
                        isSpeaking = False
                        self.__aLiNls.end()
                        util.printInfo(1, self.username, "语音处理中...")
                        self.__waitingResult(self.__aLiNls, concatenated_audio)

                        mono_data = self.__concatenate_audio_data(audio_data_list)
                        self.__save_audio_to_wav(mono_data, self.sample_rate, "cache_data/input.wav")
                        audio_data_list = []

            #拾音中
            if isSpeaking:
                audio_data_list.append(self.__process_audio_data(data, self.channels))
                if self.ASRMode == "ali":
                    self.__aLiNls.send(self.__process_audio_data(data, self.channels).tobytes())
                else:
                    concatenated_audio.extend(self.__process_audio_data(data, self.channels).tobytes())

    def __save_audio_to_wav(self, data, sample_rate, filename):
        # 确保数据类型为 int16
        if data.dtype != np.int16:
            data = data.astype(np.int16)

        # 打开 WAV 文件
        with wave.open(filename, 'wb') as wf:
            # 设置音频参数
            n_channels = 1  # 单声道
            sampwidth = 2   # 16 位音频，每个采样点 2 字节
            wf.setnchannels(n_channels)
            wf.setsampwidth(sampwidth)
            wf.setframerate(sample_rate)
            wf.writeframes(data.tobytes())

    def __concatenate_audio_data(self, audio_data_list):
        # 将累积的音频数据块连接起来
        data = np.concatenate(audio_data_list)
        return data

    #转变为单声道np.int16
    def __process_audio_data(self, data, channels):
        data = bytearray(data)
        # 将字节数据转换为 numpy 数组
        data = np.frombuffer(data, dtype=np.int16)
        # 重塑数组，将数据分离成多个声道
        data = np.reshape(data, (-1, channels))
        # 对所有声道的数据进行平均，生成单声道
        mono_data = np.mean(data, axis=1).astype(np.int16)
        return mono_data

    def set_processing(self, processing):
        self.__processing = processing

    def start(self):
        MyThread(target=self.__record).start()

    def stop(self):
        self.__running = False

    @abstractmethod
    def on_speaking(self, text):
        pass

    #TODO Edit by xszyou on 20230113:把流的获取方式封装出来方便实现麦克风录制及网络流等不同的流录制子类
    @abstractmethod
    def get_stream(self):
        pass

    @abstractmethod
    def is_remote(self):
        pass