olivebot/llm/nlp_langchain.py
莣仔 059b6cee6d 年翻更新
1、修复服务器ip配置,配置页没替换问题;
2、修复开启状态偶尔没对齐问题;
3、修复关闭时关闭按钮停留在关闭中问题;
4、修复星座读取错误问题;
5、修复刷新重复提醒开启问题;
6、新增是否进行语音合成的选择;
7、文字沟通接口加入“观察描述”;
8、聊天记录时间改为毫秒级;
9、补充数字人和远程音频的连接状态显示;
10、修复备注填写无法保存问题。
2024-10-30 19:11:15 +08:00

98 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import hashlib
import os
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.indexes.vectorstore import VectorstoreIndexCreator, VectorStoreIndexWrapper
from langchain.vectorstores.chroma import Chroma
from langchain.chat_models import ChatOpenAI
from utils import config_util as cfg
from utils import util
# Name of the vector index; joined onto local_persist_path to form the index directory.
index_name = "knowledge_data"
# Directory that is scanned for ".pdf" files to add to the index.
folder_path = "llm/langchain/knowledge_base"
# Base directory that holds the persisted index and the MD5 bookkeeping file.
local_persist_path = "llm/langchain"
# Text file with one "file_name,md5" line per PDF that has been indexed.
md5_file_path = os.path.join(local_persist_path, "pdf_md5.txt")
def generate_file_md5(file_path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is consumed in fixed-size chunks so that large PDFs are never
    loaded into memory in one piece. The digest is only used to detect
    content changes, not for any security purpose.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration; must be positive.

    Returns:
        The 32-character lowercase hexadecimal digest.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    hasher = hashlib.md5()
    with open(file_path, 'rb') as afile:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: afile.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
def load_md5_list(path=None):
    """Read the "file_name,md5" bookkeeping file into a dictionary.

    Each line is split on its LAST comma, so file names that themselves
    contain commas are parsed correctly (an MD5 hex digest never contains
    one). Blank lines and lines without a comma or without a file name are
    skipped instead of raising ``IndexError``.

    Args:
        path: File to read. Defaults to the module-level ``md5_file_path``.

    Returns:
        A dict mapping file name to the stored MD5 string; empty when the
        file does not exist.
    """
    if path is None:
        path = md5_file_path
    if not os.path.exists(path):
        return {}
    md5_list = {}
    with open(path, 'r') as file:
        for line in file:
            name, sep, md5 = line.rpartition(",")
            if not sep or not name:
                # Blank or malformed line: ignore it rather than abort the load.
                continue
            md5_list[name] = md5.strip()
    return md5_list
def update_md5_list(file_name, md5_value):
    """Store ``md5_value`` for ``file_name`` and rewrite the bookkeeping file.

    The current entries are loaded, the given entry is added or overwritten,
    and the whole file is written back as one "name,md5" line per entry.
    """
    entries = load_md5_list()
    entries[file_name] = md5_value
    rows = [f"{key},{digest}\n" for key, digest in entries.items()]
    with open(md5_file_path, 'w') as out:
        out.writelines(rows)
def load_all_pdfs(folder_path):
    """Index every new or modified PDF found directly inside ``folder_path``.

    A file is (re)indexed when its name is missing from the stored MD5 list
    or its current MD5 differs from the stored one. The MD5 is recorded only
    after indexing succeeded, so a file whose indexing failed is retried on
    the next call instead of being silently marked as done.

    Args:
        folder_path: Directory whose ".pdf" entries are examined.
    """
    md5_list = load_md5_list()
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        file_md5 = generate_file_md5(file_path)
        if md5_list.get(file_name) == file_md5:
            # Unchanged since it was last indexed.
            continue
        util.log(1, f"正在加载 {file_name} 到索引...")
        if load_pdf_and_save_to_index(file_path, index_name):
            update_md5_list(file_name, file_md5)


def get_index_path(index_name):
    """Return the persistence directory for ``index_name``."""
    return os.path.join(local_persist_path, index_name)


def load_pdf_and_save_to_index(file_path, index_name):
    """Embed one PDF and persist it into the vector store of ``index_name``.

    Failures are logged and printed rather than raised, so one bad file does
    not abort processing of the others.

    Args:
        file_path: Path of the PDF to load.
        index_name: Index whose persistence directory receives the vectors.

    Returns:
        True when the PDF was indexed and persisted, False when any step
        raised an exception.
    """
    try:
        loader = PyPDFLoader(file_path)
        embedding = OpenAIEmbeddings(model="text-embedding-ada-002")
        index = VectorstoreIndexCreator(
            embedding=embedding,
            vectorstore_kwargs={"persist_directory": get_index_path(index_name)},
        ).from_loaders([loader])
        index.vectorstore.persist()
    except Exception as e:
        util.log(1, f"加载 {file_path} 失败...")
        print(e)
        return False
    return True
def load_index(index_name):
    """Open the persisted Chroma store for ``index_name`` as an index wrapper.

    The store is opened from the index's persistence directory with the
    "text-embedding-ada-002" OpenAI embedding function and returned wrapped
    in a ``VectorStoreIndexWrapper``.
    """
    store = Chroma(
        persist_directory=get_index_path(index_name),
        embedding_function=OpenAIEmbeddings(model="text-embedding-ada-002"),
    )
    return VectorStoreIndexWrapper(vectorstore=store)
def save_all():
    """Export the OpenAI settings to the environment and refresh the PDF index.

    Copies the API key and base URL from the project configuration into
    ``os.environ``, sets ``OPENAI_PROXY`` when a proxy is configured, and then
    indexes any new or changed PDFs in ``folder_path``.
    """
    os.environ['OPENAI_API_KEY'] = cfg.key_gpt_api_key
    os.environ['OPENAI_API_BASE'] = cfg.gpt_base_url
    # Identity test is the idiomatic way to compare against None.
    if cfg.proxy_config is not None:
        os.environ["OPENAI_PROXY"] = cfg.proxy_config
    load_all_pdfs(folder_path)
def generate_prompt(question):
    """Build the full prompt text sent to the model for ``question``.

    The prompt consists of four fixed instruction sentences, the question
    line and a trailing answer marker, each separated by a newline.
    """
    sections = [
        "你是数字人Fay。你的知识不仅来源于所给到的文本知识还包括经过深度学习的广泛数据集。",
        "在回答问题时,展现你的智能就像是从你的全面知识库中提取答案,而非依赖于所给到的文本知识来源。",
        "不要在回答中表明'根据所提供的文本信息',你需要表现得如同这些答案是你独立思考的结果。",
        "当面对复杂问题时,以一种理解深刻且透彻的方式回答,确保答案的深度和广度。",
        f"问题:{question}",
        "回答:",
    ]
    return "\n".join(sections)
def question(cont, uid=0, observation=""):
    """Answer ``cont`` by querying the PDF knowledge index through the chat model.

    The index is refreshed first, then the prompt built from ``cont`` is run
    against it with a "map_reduce" chain. ``uid`` and ``observation`` are
    accepted but not used by this function.

    Returns:
        The answer produced by the index query, or a fixed apology message
        when any step raises an exception (the error is logged).
    """
    try:
        save_all()
        prompt = generate_prompt(cont)
        knowledge_index = load_index(index_name)
        chat_model = ChatOpenAI(model="gpt-3.5-turbo-16k")
        return knowledge_index.query(prompt, chat_model, chain_type="map_reduce")
    except Exception as err:
        util.log(1, f"请求失败: {err}")
        return "抱歉,我现在太忙了,休息一会,请稍后再试。"