olivebot/llm/nlp_langchain.py
guo zebin 4cfad5ae0f 年翻更新
- 全新ui
- 全面优化websocket逻辑,提高数字人和ui连接的稳定性及资源开销
- 全面优化唤醒逻辑,提供稳定的普通唤醒模式和前置词唤醒模式
- 优化拾音质量,支持多声道麦克风拾音
- 优化自动播放服务器的对接机制,提供稳定和兼容旧版ue工程的对接模式
- 数字人接口输出机器人表情,以适应新fay ui及单片机的数字人表情输出
- 使用更高级的音频时长计算方式,可以更精准控制音频播放完成后的逻辑
- 修复点击关闭按钮会导致程序退出的bug
- 修复没有麦克风的设备开启麦克风会出错的问题
- 为服务器主机地址提供配置项,以方便服务器部署
2024-10-26 11:34:55 +08:00

98 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import hashlib
import os
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.indexes.vectorstore import VectorstoreIndexCreator, VectorStoreIndexWrapper
from langchain.vectorstores.chroma import Chroma
from langchain.chat_models import ChatOpenAI
from utils import config_util as cfg
from utils import util
# Name of the persisted vector index; joined onto local_persist_path to form
# the index directory.
index_name = "knowledge_data"
# Directory that is scanned for ".pdf" files to add to the index.
folder_path = "llm/langchain/knowledge_base"
# Base directory that holds the persisted index and the MD5 bookkeeping file.
local_persist_path = "llm/langchain"
# Text file with one "file_name,md5" line per PDF that has been indexed.
md5_file_path = os.path.join(local_persist_path, "pdf_md5.txt")
def generate_file_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is hashed in fixed-size chunks so that large files are never
    loaded into memory in one piece.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The 32-character lowercase hex digest of the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024  # 1 MiB per read keeps memory use bounded
    hasher = hashlib.md5()
    with open(file_path, 'rb') as afile:
        # iter(callable, sentinel) stops at the empty bytes object read at EOF.
        for chunk in iter(lambda: afile.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
def load_md5_list():
    """Load the recorded PDF digests from ``md5_file_path``.

    Each line of the file has the form ``file_name,md5``. The line is split
    on its LAST comma, so a file name that itself contains commas is parsed
    correctly (a hex digest never contains one). Lines without any comma,
    such as blank lines, are skipped instead of raising ``IndexError``.

    Returns:
        A dict mapping file name to its recorded MD5 hex digest; empty when
        the bookkeeping file does not exist.
    """
    if not os.path.exists(md5_file_path):
        return {}
    md5_list = {}
    with open(md5_file_path, 'r') as file:
        for line in file:
            name, sep, md5 = line.rstrip("\r\n").rpartition(",")
            if not sep:
                # No comma at all: blank or malformed line, nothing to record.
                continue
            md5_list[name] = md5.strip()
    return md5_list
def update_md5_list(file_name, md5_value):
    """Record ``md5_value`` for ``file_name`` and rewrite the MD5 file.

    The current records are loaded, the entry for ``file_name`` is added or
    replaced, and the whole file is written back as ``file_name,md5`` lines.

    Args:
        file_name: Name of the PDF whose digest is being recorded.
        md5_value: Hex digest to store for that file.
    """
    records = load_md5_list()
    records[file_name] = md5_value
    lines = [f"{name},{digest}\n" for name, digest in records.items()]
    with open(md5_file_path, 'w') as out:
        out.writelines(lines)
def load_all_pdfs(folder_path):
    """Index every new or changed PDF found directly inside ``folder_path``.

    A PDF is (re)indexed when its name is not in the MD5 records or its
    current digest differs from the recorded one. The digest is recorded
    only after indexing succeeded, so a PDF whose indexing failed is
    attempted again on the next call instead of being skipped.

    Args:
        folder_path: Directory to scan for files ending in ``.pdf``.
    """
    md5_list = load_md5_list()
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        file_md5 = generate_file_md5(file_path)
        # .get() returns None for unknown files, which never equals a digest.
        if md5_list.get(file_name) == file_md5:
            continue
        util.log(1, f"正在加载 {file_name} 到索引...")
        if load_pdf_and_save_to_index(file_path, index_name):
            update_md5_list(file_name, file_md5)


def get_index_path(index_name):
    """Return the directory under ``local_persist_path`` for ``index_name``."""
    return os.path.join(local_persist_path, index_name)


def load_pdf_and_save_to_index(file_path, index_name):
    """Embed one PDF and persist it into the named vector index.

    Failures are logged and printed rather than raised, so one bad PDF does
    not abort the scan of the remaining files.

    Args:
        file_path: Path of the PDF to load.
        index_name: Name of the index whose directory receives the data.

    Returns:
        True when the PDF was indexed and persisted, False when any step
        raised an exception.
    """
    try:
        loader = PyPDFLoader(file_path)
        embedding = OpenAIEmbeddings(model="text-embedding-ada-002")
        index = VectorstoreIndexCreator(
            embedding=embedding,
            vectorstore_kwargs={"persist_directory": get_index_path(index_name)},
        ).from_loaders([loader])
        index.vectorstore.persist()
    except Exception as e:
        util.log(1, f"加载 {file_path} 失败...")
        print(e)
        return False
    return True
def load_index(index_name):
    """Open the persisted Chroma store for ``index_name`` as an index wrapper.

    Args:
        index_name: Name of the index; resolved to a directory through
            ``get_index_path``.

    Returns:
        A ``VectorStoreIndexWrapper`` around the Chroma vector store.
    """
    store = Chroma(
        persist_directory=get_index_path(index_name),
        embedding_function=OpenAIEmbeddings(model="text-embedding-ada-002"),
    )
    return VectorStoreIndexWrapper(vectorstore=store)
def save_all():
    """Export the OpenAI settings to the environment and refresh the index.

    Copies the API key and base URL from the project config into
    ``os.environ``, sets ``OPENAI_PROXY`` when a proxy is configured, and
    then indexes any new or changed PDFs in ``folder_path``.
    """
    os.environ['OPENAI_API_KEY'] = cfg.key_gpt_api_key
    os.environ['OPENAI_API_BASE'] = cfg.gpt_base_url
    # Identity comparison is the correct way to test for None.
    if cfg.proxy_config is not None:
        os.environ["OPENAI_PROXY"] = cfg.proxy_config
    load_all_pdfs(folder_path)
def generate_prompt(question):
    """Build the full prompt text sent to the model for ``question``.

    The prompt is four fixed instruction sentences, then a line carrying the
    question, then a trailing answer label, all joined by newlines.

    Args:
        question: The user's question; formatted into the prompt as text.

    Returns:
        The assembled prompt string.
    """
    sections = (
        "你是数字人Fay。你的知识不仅来源于所给到的文本知识,还包括经过深度学习的广泛数据集。",
        "在回答问题时,展现你的智能就像是从你的全面知识库中提取答案,而非依赖于所给到的文本知识来源。",
        "不要在回答中表明'根据所提供的文本信息',你需要表现得如同这些答案是你独立思考的结果。",
        "当面对复杂问题时,以一种理解深刻且透彻的方式回答,确保答案的深度和广度。",
        f"问题:{question}",
        "回答:",
    )
    return "\n".join(sections)
def question(cont, uid=0):
    """Answer ``cont`` using the PDF knowledge index.

    Refreshes the environment and index through ``save_all``, builds the
    prompt, loads the persisted index and queries it with a chat model. Any
    exception is logged and replaced by a fixed apology message.

    Args:
        cont: The user's question text.
        uid: Accepted for the caller's benefit; not used by this function.

    Returns:
        The query result, or the fallback apology string when any step
        raised an exception.
    """
    try:
        save_all()
        prompt = generate_prompt(cont)
        knowledge_index = load_index(index_name)
        chat_model = ChatOpenAI(model="gpt-3.5-turbo-16k")
        return knowledge_index.query(prompt, chat_model, chain_type="map_reduce")
    except Exception as err:
        util.log(1, f"请求失败: {err}")
    return "抱歉,我现在太忙了,休息一会,请稍后再试。"