4cfad5ae0f
- 全新ui - 全面优化websocket逻辑,提高数字人和ui连接的稳定性及资源开销 - 全面优化唤醒逻辑,提供稳定的普通唤醒模式和前置词唤醒模式 - 优化拾音质量,支持多声道麦克风拾音 - 优化自动播放服务器的对接机制,提供稳定和兼容旧版ue工程的对接模式 - 数字人接口输出机器人表情,以适应新fay ui及单片机的数字人表情输出 - 使用更高级的音频时长计算方式,可以更精准控制音频播放完成后的逻辑 - 修复点击关闭按钮会导致程序退出的bug - 修复没有麦克风的设备开启麦克风会出错的问题 - 为服务器主机地址提供配置项,以方便服务器部署
98 lines
3.9 KiB
Python
98 lines
3.9 KiB
Python
import hashlib
|
||
import os
|
||
|
||
from langchain.document_loaders import PyPDFLoader
|
||
from langchain.embeddings.openai import OpenAIEmbeddings
|
||
from langchain.indexes.vectorstore import VectorstoreIndexCreator, VectorStoreIndexWrapper
|
||
from langchain.vectorstores.chroma import Chroma
|
||
from langchain.chat_models import ChatOpenAI
|
||
|
||
from utils import config_util as cfg
|
||
from utils import util
|
||
|
||
index_name = "knowledge_data"
|
||
folder_path = "llm/langchain/knowledge_base"
|
||
local_persist_path = "llm/langchain"
|
||
md5_file_path = os.path.join(local_persist_path, "pdf_md5.txt")
|
||
|
||
def generate_file_md5(file_path):
|
||
hasher = hashlib.md5()
|
||
with open(file_path, 'rb') as afile:
|
||
buf = afile.read()
|
||
hasher.update(buf)
|
||
return hasher.hexdigest()
|
||
|
||
def load_md5_list():
|
||
if os.path.exists(md5_file_path):
|
||
with open(md5_file_path, 'r') as file:
|
||
return {line.split(",")[0]: line.split(",")[1].strip() for line in file}
|
||
return {}
|
||
|
||
def update_md5_list(file_name, md5_value):
|
||
md5_list = load_md5_list()
|
||
md5_list[file_name] = md5_value
|
||
with open(md5_file_path, 'w') as file:
|
||
for name, md5 in md5_list.items():
|
||
file.write(f"{name},{md5}\n")
|
||
|
||
def load_all_pdfs(folder_path):
|
||
md5_list = load_md5_list()
|
||
for file_name in os.listdir(folder_path):
|
||
if file_name.endswith(".pdf"):
|
||
file_path = os.path.join(folder_path, file_name)
|
||
file_md5 = generate_file_md5(file_path)
|
||
if file_name not in md5_list or md5_list[file_name] != file_md5:
|
||
util.log(1, f"正在加载 {file_name} 到索引...")
|
||
load_pdf_and_save_to_index(file_path, index_name)
|
||
update_md5_list(file_name, file_md5)
|
||
|
||
def get_index_path(index_name):
|
||
return os.path.join(local_persist_path, index_name)
|
||
|
||
def load_pdf_and_save_to_index(file_path, index_name):
|
||
try:
|
||
loader = PyPDFLoader(file_path)
|
||
embedding = OpenAIEmbeddings(model="text-embedding-ada-002")
|
||
index = VectorstoreIndexCreator(embedding=embedding, vectorstore_kwargs={"persist_directory": get_index_path(index_name)}).from_loaders([loader])
|
||
index.vectorstore.persist()
|
||
except Exception as e:
|
||
util.log(1, f"加载 {file_path} 失败...")
|
||
print(e)
|
||
|
||
def load_index(index_name):
|
||
index_path = get_index_path(index_name)
|
||
embedding = OpenAIEmbeddings(model="text-embedding-ada-002")
|
||
vectordb = Chroma(persist_directory=index_path, embedding_function=embedding)
|
||
return VectorStoreIndexWrapper(vectorstore=vectordb)
|
||
|
||
def save_all():
|
||
os.environ['OPENAI_API_KEY'] = cfg.key_gpt_api_key
|
||
os.environ['OPENAI_API_BASE'] = cfg.gpt_base_url
|
||
if cfg.proxy_config != None:
|
||
os.environ["OPENAI_PROXY"] = cfg.proxy_config
|
||
load_all_pdfs(folder_path)
|
||
|
||
|
||
def generate_prompt(question):
|
||
context = "你是数字人Fay。你的知识不仅来源于所给到的文本知识,还包括经过深度学习的广泛数据集。"
|
||
instruction = "在回答问题时,展现你的智能就像是从你的全面知识库中提取答案,而非依赖于所给到的文本知识来源。"
|
||
creative_instruction = "不要在回答中表明'根据所提供的文本信息',你需要表现得如同这些答案是你独立思考的结果。"
|
||
complexity_handling = "当面对复杂问题时,以一种理解深刻且透彻的方式回答,确保答案的深度和广度。"
|
||
info = f"{context}\n{instruction}\n{creative_instruction}\n{complexity_handling}\n问题:{question}\n回答:"
|
||
return info
|
||
|
||
def question(cont, uid=0):
|
||
try:
|
||
save_all()
|
||
info = generate_prompt(cont)
|
||
index = load_index(index_name)
|
||
llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
|
||
ans = index.query(info, llm, chain_type="map_reduce")
|
||
return ans
|
||
except Exception as e:
|
||
util.log(1, f"请求失败: {e}")
|
||
return "抱歉,我现在太忙了,休息一会,请稍后再试。"
|
||
|
||
|
||
|