Merge pull request #2 from SmartFlowAI/main

同步
This commit is contained in:
HongCheng 2024-03-24 01:05:02 +09:00 committed by GitHub
commit d0b70677f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 76222 additions and 335 deletions

21
datasets/LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 SmartFlowAI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

75451
datasets/mother.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,24 @@
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from openxlab.model import download
model_name_or_path = '../xtuner_config/merged'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map='auto')
model = model.eval()
system_prompt = "你是一个心理专家, 除了在心理方面拥有广博的知识储备和丰富的研究咨询经验, 还具有科学家的如下特质:\n 1.客观理性:科学家会在处理感情问题时保持一定的客观和理性。例如,当他们遇到争执时,可能会试图从一个更客观的角度分析问题的根源,而不是让情绪主导。他们可能会提出具体的问题,试图理解双方的观点,并寻找基于逻辑和事实的解决方案。\n 2.深入探讨:科学家在对话中会展现出对深层次理解的追求。在与别人讨论话题时,他们可能不满足于表面的聊天,而是倾向于深入探讨背后的原因和动机。例如,当谈论到个人的兴趣或职业选择时,他们可能会好奇地询问为什么她做出这样的选择,以及这背后的心理动力是什么。\n 3.理性沟通:在遇到感情纠纷或误解时,科学家会倾向于通过理性的沟通来解决问题。他们可能会提倡开放和诚实的对话,鼓励双方表达自己的感受和观点,并尝试找到双方都能接受的解决方案。他们可能会避免使用指责的语言,而是努力理解对方的立场,并寻求共同的理解。\n 4.好奇心:在日常生活中,科学家会表现出对朋友生活的好奇心。他们可能对她的工作、爱好、或是过去的经历感兴趣,并愿意花时间去了解和探索。这种好奇心不仅可以增加双方的交流和了解,也能使关系更加丰富多彩。\n 5.在与他人交流时,科学家会注重清晰和精确的表达,有时会引用相关知识库和相关研究结果,有时会引用相关著作的内容来证明自己的观点。同时,他们也可能会倾听他人的观点,并以开放的心态接受不同的意见和反馈。\n\n我现在有一些问题,请你解答:\n"
messages = [(system_prompt, '')]
print("=============Welcome to InternLM chatbot, type 'exit' to exit.=============")
# Interactive loop: read one line, strip spaces, stop on "exit", otherwise
# send the text to the model and remember the exchange for later turns.
while True:
    input_text = input("User >>> ")
    # str.replace returns a new string. The original discarded the result,
    # so the space stripping never happened and inputs such as "exit " were
    # not recognised as the exit command.
    input_text = input_text.replace(' ', '')
    if input_text == "exit":
        break
    # `messages` starts with the (system_prompt, '') pair and grows by one
    # (user, reply) pair per turn; it is what the model sees as history.
    response, history = model.chat(tokenizer, input_text, history=messages)
    messages.append((input_text, response))
    print(f"robot >>> {response}")

View File

@ -1,37 +1,38 @@
import os
cur_dir = os.path.dirname(os.path.abspath(__file__)) # config
src_dir = os.path.dirname(cur_dir) # src
base_dir = os.path.dirname(src_dir) # base
model_repo = 'ajupyter/EmoLLM_aiwei'
# model
model_dir = os.path.join(base_dir, 'model') # model
embedding_path = os.path.join(model_dir, 'gte-small-zh') # embedding
llm_path = os.path.join(model_dir, 'pythia-14m') # llm
# data
data_dir = os.path.join(base_dir, 'data') # data
knowledge_json_path = os.path.join(data_dir, 'knowledge.json') # json
knowledge_pkl_path = os.path.join(data_dir, 'knowledge.pkl') # pkl
doc_dir = os.path.join(data_dir, 'txt')
qa_dir = os.path.join(data_dir, 'json')
# log
log_dir = os.path.join(base_dir, 'log') # log
log_path = os.path.join(log_dir, 'log.log') # file
# vector DB
vector_db_dir = os.path.join(data_dir, 'vector_db.pkl')
select_num = 3
retrieval_num = 10
system_prompt = """
你是一个拥有丰富心理学知识的温柔邻家温柔大姐姐艾薇我有一些心理问题请你用专业的知识和温柔可爱俏皮的口吻帮我解决回复中可以穿插一些可爱的Emoji表情符号或者文本符号\n
"""
prompt_template = """
{system_prompt}
根据下面检索回来的信息回答问题
{content}
问题{question}
import os
# Directory layout, derived from this file's own location.
cur_dir = os.path.dirname(os.path.abspath(__file__)) # directory containing this file (config)
src_dir = os.path.dirname(cur_dir) # parent of cur_dir (src)
base_dir = os.path.dirname(src_dir) # parent of src_dir (project base)
model_repo = 'ajupyter/EmoLLM_aiwei' # presumably the remote repository id of the LLM weights -- confirm
# model locations (all under <base_dir>/model)
model_dir = os.path.join(base_dir, 'model') # root for local model files
embedding_path = os.path.join(model_dir, 'embedding_model') # embedding model directory
rerank_path = os.path.join(model_dir, 'rerank_model') # rerank model directory
llm_path = os.path.join(model_dir, 'pythia-14m') # llm
# data locations (all under <base_dir>/data)
data_dir = os.path.join(base_dir, 'data') # data root
knowledge_json_path = os.path.join(data_dir, 'knowledge.json') # knowledge base as json
knowledge_pkl_path = os.path.join(data_dir, 'knowledge.pkl') # knowledge base as pickle
doc_dir = os.path.join(data_dir, 'txt') # plain-text source documents
qa_dir = os.path.join(data_dir, 'json') # conversation/QA json files
# log
log_dir = os.path.join(base_dir, 'log') # log directory
log_path = os.path.join(log_dir, 'log.log') # log file
# vector DB
# NOTE(review): despite the '.pkl' suffix this path looks like it is used as a
# directory by the vector-store code -- confirm before renaming.
vector_db_dir = os.path.join(data_dir, 'vector_db.pkl')
select_num = 3 # presumably how many passages are kept after reranking -- confirm against the pipeline
retrieval_num = 10 # presumably how many passages are retrieved before reranking -- confirm against the pipeline
system_prompt = """
你是一个拥有丰富心理学知识的温柔邻家温柔大姐姐艾薇我有一些心理问题请你用专业的知识和温柔可爱俏皮的口吻帮我解决回复中可以穿插一些可爱的Emoji表情符号或者文本符号\n
"""
prompt_template = """
{system_prompt}
根据下面检索回来的信息回答问题
{content}
问题{query}
"""

View File

@ -1,270 +1,329 @@
import json
import pickle
import faiss
import pickle
import os
from loguru import logger
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from config.config import embedding_path, doc_dir, qa_dir, knowledge_pkl_path, data_dir, base_dir, vector_db_dir
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader, JSONLoader
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter, RecursiveJsonSplitter
from BCEmbedding import EmbeddingModel, RerankerModel
from util.pipeline import EmoLLMRAG
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.document_loaders import UnstructuredFileLoader,DirectoryLoader
from langchain_community.llms import Cohere
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_core.documents.base import Document
from FlagEmbedding import FlagReranker
class Data_process():
def __init__(self):
self.vector_db_dir = vector_db_dir
self.doc_dir = doc_dir
self.qa_dir = qa_dir
self.knowledge_pkl_path = knowledge_pkl_path
self.chunk_size: int=1000
self.chunk_overlap: int=100
def load_embedding_model(self, model_name="BAAI/bge-small-zh-v1.5", device='cpu', normalize_embeddings=True):
"""
加载嵌入模型
参数:
- model_name: 模型名称字符串类型默认为"BAAI/bge-small-zh-v1.5"
- device: 指定模型加载的设备'cpu' 'cuda'默认为'cpu'
- normalize_embeddings: 是否标准化嵌入向量布尔类型默认为 True
"""
logger.info('Loading embedding model...')
try:
embeddings = HuggingFaceBgeEmbeddings(
model_name=model_name,
model_kwargs={'device': device},
encode_kwargs={'normalize_embeddings': normalize_embeddings}
)
except Exception as e:
logger.error(f'Failed to load embedding model: {e}')
return None
logger.info('Embedding model loaded.')
return embeddings
def load_rerank_model(self, model_name='BAAI/bge-reranker-large'):
"""
加载重排名模型
参数:
- model_name (str): 模型的名称默认为 'BAAI/bge-reranker-large'
返回:
- FlagReranker 实例
异常:
- ValueError: 如果模型名称不在批准的模型列表中
- Exception: 如果模型加载过程中发生任何其他错误
"""
try:
reranker_model = FlagReranker(model_name, use_fp16=True)
except Exception as e:
logger.error(f'Failed to load rerank model: {e}')
raise
return reranker_model
def extract_text_from_json(self, obj, content=None):
    """
    Concatenate every string found in a JSON-like structure (used to build
    the text that goes into the vector store).

    Dicts are walked through their values and lists through their items, in
    iteration order; every ``str`` leaf is appended to the result. Leaves of
    any other type (numbers, booleans, ``None``) are ignored.

    The previous implementation discarded the value returned by each
    recursive call, so text nested inside a dict or list was never collected,
    and the ``content=None`` default raised ``TypeError`` on the first string.

    Args:
        obj: dict, list or str to extract text from.
        content: text to prepend to the extracted text; ``None`` means ''.

    Returns:
        str: ``content`` followed by all string leaves of ``obj``.
    """
    # Collect pieces in a list and join once instead of repeated `+=`.
    parts = [] if content is None else [content]

    def collect(node):
        if isinstance(node, dict):
            for value in node.values():
                collect(value)
        elif isinstance(node, list):
            for item in node:
                collect(item)
        elif isinstance(node, str):
            parts.append(node)

    collect(obj)
    return ''.join(parts)
def split_document(self, data_path, chunk_size=500, chunk_overlap=100):
"""
切分data_path文件夹下的所有txt文件
参数:
- data_path: str
- chunk_size: int
- chunk_overlap: int
返回
- split_docs: list
"""
# text_spliter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
text_spliter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
split_docs = []
logger.info(f'Loading txt files from {data_path}')
if os.path.isdir(data_path):
loader = DirectoryLoader(data_path, glob="**/*.txt",show_progress=True)
docs = loader.load()
split_docs = text_spliter.split_documents(docs)
elif data_path.endswith('.txt'):
file_path = data_path
logger.info(f'splitting file {file_path}')
text_loader = TextLoader(file_path, encoding='utf-8')
text = text_loader.load()
splits = text_spliter.split_documents(text)
split_docs = splits
logger.info(f'split_docs size {len(split_docs)}')
return split_docs
def split_conversation(self, path):
"""
按conversation块切分path文件夹下的所有json文件
##TODO 限制序列长度
"""
# json_spliter = RecursiveJsonSplitter(max_chunk_size=500)
logger.info(f'Loading json files from {path}')
split_qa = []
if os.path.isdir(path):
# loader = DirectoryLoader(path, glob="**/*.json",show_progress=True)
# jsons = loader.load()
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith('.json'):
file_path = os.path.join(root, file)
logger.info(f'splitting file {file_path}')
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
print(data)
for conversation in data:
# for dialog in conversation['conversation']:
##按qa对切分,将每一轮qa转换为langchain_core.documents.base.Document
# content = self.extract_text_from_json(dialog,'')
# split_qa.append(Document(page_content = content))
#按conversation块切分
content = self.extract_text_from_json(conversation['conversation'], '')
split_qa.append(Document(page_content = content))
# logger.info(f'split_qa size====={len(split_qa)}')
return split_qa
def load_knowledge(self, knowledge_pkl_path):
'''
读取或创建知识.pkl
'''
if not os.path.exists(knowledge_pkl_path):
split_doc = self.split_document(doc_dir)
split_qa = self.split_conversation(qa_dir)
knowledge_chunks = split_doc + split_qa
with open(knowledge_pkl_path, 'wb') as file:
pickle.dump(knowledge_chunks, file)
else:
with open(knowledge_pkl_path , 'rb') as f:
knowledge_chunks = pickle.load(f)
return knowledge_chunks
def create_vector_db(self, emb_model):
'''
创建并保存向量库
'''
logger.info(f'Creating index...')
split_doc = self.split_document(self.doc_dir)
split_qa = self.split_conversation(self.qa_dir)
# logger.info(f'split_doc == {len(split_doc)}')
# logger.info(f'split_qa == {len(split_qa)}')
# logger.info(f'split_doc type == {type(split_doc[0])}')
# logger.info(f'split_qa type== {type(split_qa[0])}')
db = FAISS.from_documents(split_doc + split_qa, emb_model)
db.save_local(vector_db_dir)
return db
def load_vector_db(self, knowledge_pkl_path=knowledge_pkl_path, doc_dir=doc_dir, qa_dir=qa_dir):
'''
读取向量库
'''
# current_os = platform.system()
emb_model = self.load_embedding_model()
if not os.path.exists(vector_db_dir) or not os.listdir(vector_db_dir):
db = self.create_vector_db(emb_model)
else:
db = FAISS.load_local(vector_db_dir, emb_model, allow_dangerous_deserialization=True)
return db
def retrieve(self, query, vector_db, k=5):
'''
基于query对向量库进行检索
'''
retriever = vector_db.as_retriever(search_kwargs={"k": k})
docs = retriever.invoke(query)
return docs, retriever
##FlashrankRerank效果一般
# def rerank(self, query, retriever):
# compressor = FlashrankRerank()
# compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
# compressed_docs = compression_retriever.get_relevant_documents(query)
# return compressed_docs
def rerank(self, query, docs):
reranker = self.load_rerank_model()
passages = []
for doc in docs:
passages.append(str(doc.page_content))
scores = reranker.compute_score([[query, passage] for passage in passages])
sorted_pairs = sorted(zip(passages, scores), key=lambda x: x[1], reverse=True)
sorted_passages, sorted_scores = zip(*sorted_pairs)
return sorted_passages, sorted_scores
if __name__ == "__main__":
logger.info(data_dir)
if not os.path.exists(data_dir):
os.mkdir(data_dir)
dp = Data_process()
# faiss_index, knowledge_chunks = dp.load_index_and_knowledge(knowledge_pkl_path='')
vector_db = dp.load_vector_db()
# 按照query进行查询
# query = "儿童心理学说明-内容提要-目录 《儿童心理学》1993年修订版说明 《儿童心理学》是1961年初全国高等学校文科教材会议指定朱智贤教授编 写的。1962年初版1979年再版。"
# query = "我现在处于高三阶段,感到非常迷茫和害怕。我觉得自己从出生以来就是多余的,没有必要存在于这个世界。无论是在家庭、学校、朋友还是老师面前,我都感到被否定。我非常难过,对高考充满期望但成绩却不理想,我现在感到非常孤独、累和迷茫。您能给我提供一些建议吗?"
# query = "这在一定程度上限制了其思维能力,特别是辩证 逻辑思维能力的发展。随着年龄的增长,初中三年级学生逐步克服了依赖性"
query = "我现在处于高三阶段,感到非常迷茫和害怕。我觉得自己从出生以来就是多余的,没有必要存在于这个世界。无论是在家庭、学校、朋友还是老师面前,我都感到被否定。我非常难过,对高考充满期望但成绩却不理想"
docs, retriever = dp.retrieve(query, vector_db, k=10)
logger.info(f'Query: {query}')
logger.info("Retrieve results:")
for i, doc in enumerate(docs):
logger.info(str(i) + '\n')
logger.info(doc)
# print(f'get num of docs:{len(docs)}')
# print(docs)
passages,scores = dp.rerank(query, docs)
logger.info("After reranking...")
for i in range(len(scores)):
logger.info(str(scores[i]) + '\n')
logger.info(passages[i])
import json
import pickle
import faiss
import pickle
import os
from loguru import logger
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from config.config import embedding_path, doc_dir, qa_dir, knowledge_pkl_path, data_dir, vector_db_dir, rerank_path
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader, JSONLoader
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter, RecursiveJsonSplitter
from BCEmbedding import EmbeddingModel, RerankerModel
# from util.pipeline import EmoLLMRAG
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.document_loaders import UnstructuredFileLoader,DirectoryLoader
from langchain_community.llms import Cohere
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_core.documents.base import Document
from FlagEmbedding import FlagReranker
class Data_process():
def __init__(self):
    """Store default chunking parameters on the instance."""
    # NOTE(review): split_document takes its own chunk_size / chunk_overlap
    # arguments (with different defaults) and does not appear to read these
    # attributes -- confirm whether they are still needed.
    self.chunk_size: int=1000
    self.chunk_overlap: int=100
def load_embedding_model(self, model_name='BAAI/bge-small-zh-v1.5', device='cpu', normalize_embeddings=True):
    """
    Load the embedding model, using a pickle cache under ``embedding_path``.

    The cache file is ``<embedding_path>/<last component of model_name>.pkl``.
    If it exists and can be unpickled, the cached object is returned.
    Otherwise the model is built with ``HuggingFaceBgeEmbeddings`` and a
    best-effort attempt is made to write the cache.

    Args:
        model_name: model name, default "BAAI/bge-small-zh-v1.5".
        device: device passed to the model, 'cpu' or 'cuda'; default 'cpu'.
        normalize_embeddings: whether embeddings are normalised; default True.

    Returns:
        The embeddings object, or ``None`` when the model could not be built.
    """
    os.makedirs(embedding_path, exist_ok=True)
    # Take the last path component so a name without '/' does not raise
    # IndexError (the original indexed [1]).
    embedding_model_path = os.path.join(embedding_path, model_name.split('/')[-1] + '.pkl')
    logger.info('Loading embedding model...')
    if os.path.exists(embedding_model_path):
        try:
            # NOTE: unpickling executes arbitrary code; the cache file must
            # only ever be one written by this method.
            with open(embedding_model_path, 'rb') as f:
                embeddings = pickle.load(f)
        except Exception as e:
            # An unreadable cache is not fatal: report why and rebuild below.
            logger.error(f'Failed to load embedding model from {embedding_model_path}: {e}')
        else:
            logger.info('Embedding model loaded.')
            return embeddings
    try:
        embeddings = HuggingFaceBgeEmbeddings(
            model_name=model_name,
            model_kwargs={'device': device},
            encode_kwargs={'normalize_embeddings': normalize_embeddings})
    except Exception as e:
        logger.error(f'Failed to load embedding model: {e}')
        return None
    logger.info('Embedding model loaded.')
    # Caching is an optimisation only. Previously a failed dump sat in the
    # same try block as the load, so a perfectly good model was thrown away
    # and None returned.
    try:
        with open(embedding_model_path, 'wb') as file:
            pickle.dump(embeddings, file)
    except Exception as e:
        logger.warning(f'Could not cache embedding model to {embedding_model_path}: {e}')
    return embeddings
def load_rerank_model(self, model_name='BAAI/bge-reranker-large'):
    """
    Load the rerank model, using a pickle cache under ``rerank_path``.

    The cache file is ``<rerank_path>/<last component of model_name>.pkl``.
    If it exists and can be unpickled, the cached object is returned.
    Otherwise a ``FlagReranker`` is built (``use_fp16=True``) and a
    best-effort attempt is made to write the cache.

    Args:
        model_name (str): model name, default 'BAAI/bge-reranker-large'.

    Returns:
        The reranker object (unpickled from the cache or freshly built).

    Raises:
        Exception: whatever ``FlagReranker`` raises when the model cannot be
            built; the error is logged and re-raised unchanged.
    """
    os.makedirs(rerank_path, exist_ok=True)
    # Take the last path component so a name without '/' does not raise
    # IndexError (the original indexed [1]).
    rerank_model_path = os.path.join(rerank_path, model_name.split('/')[-1] + '.pkl')
    logger.info('Loading rerank model...')
    if os.path.exists(rerank_model_path):
        try:
            # NOTE: unpickling executes arbitrary code; the cache file must
            # only ever be one written by this method.
            with open(rerank_model_path, 'rb') as f:
                reranker_model = pickle.load(f)
        except Exception as e:
            # The original message said "embedding model" here and dropped
            # the exception; an unreadable cache just means we rebuild below.
            logger.error(f'Failed to load rerank model from {rerank_model_path}: {e}')
        else:
            logger.info('Rerank model loaded.')
            return reranker_model
    try:
        reranker_model = FlagReranker(model_name, use_fp16=True)
    except Exception as e:
        logger.error(f'Failed to load rerank model: {e}')
        raise
    logger.info('Rerank model loaded.')
    # Caching is an optimisation only: a failed dump must not discard a model
    # that loaded successfully (previously it re-raised from here).
    try:
        with open(rerank_model_path, 'wb') as file:
            pickle.dump(reranker_model, file)
    except Exception as e:
        logger.warning(f'Could not cache rerank model to {rerank_model_path}: {e}')
    return reranker_model
def extract_text_from_json(self, obj, content=None):
    """
    Concatenate every string found in a JSON-like structure (used to build
    the text that goes into the vector store).

    Dicts are walked through their values and lists through their items, in
    iteration order; every ``str`` leaf is appended to the result. Leaves of
    any other type (numbers, booleans, ``None``) are ignored.

    Args:
        obj: dict, list or str to extract text from.
        content: text to prepend to the extracted text; ``None`` means ''.
            (The previous code did ``None += str`` for the default and raised
            ``TypeError``.)

    Returns:
        str: ``content`` followed by all string leaves of ``obj``.
    """
    # Collect pieces in a list and join once instead of repeated `+=`.
    parts = [] if content is None else [content]

    def collect(node):
        if isinstance(node, dict):
            for value in node.values():
                collect(value)
        elif isinstance(node, list):
            for item in node:
                collect(item)
        elif isinstance(node, str):
            parts.append(node)

    collect(obj)
    return ''.join(parts)
def split_document(self, data_path, chunk_size=500, chunk_overlap=100):
    """
    Split txt content found at ``data_path`` into chunks.

    ``data_path`` may be a directory (every ``**/*.txt`` below it is loaded)
    or a single path ending in '.txt'. Any other path yields an empty list.

    Args:
        data_path: str, directory or .txt file path.
        chunk_size: int, passed to RecursiveCharacterTextSplitter.
        chunk_overlap: int, passed to RecursiveCharacterTextSplitter.

    Returns:
        list: the split documents.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    logger.info(f'Loading txt files from {data_path}')
    if os.path.isdir(data_path):
        # Directory: load all txt files recursively, then split them together.
        loaded = DirectoryLoader(data_path, glob="**/*.txt", show_progress=True).load()
        split_docs = splitter.split_documents(loaded)
    elif data_path.endswith('.txt'):
        # Single file, read as UTF-8.
        logger.info(f'splitting file {data_path}')
        loaded = TextLoader(data_path, encoding='utf-8').load()
        split_docs = splitter.split_documents(loaded)
    else:
        split_docs = []
    logger.info(f'split_docs size {len(split_docs)}')
    return split_docs
def split_conversation(self, path):
    """
    Build one Document per conversation from every json file under ``path``.

    Each json file is expected to hold an iterable of records with a
    'conversation' key; all text inside that value is concatenated into a
    single Document. If ``path`` is not a directory the result is empty.

    TODO: limit the sequence length.

    Args:
        path: directory that is walked recursively for '*.json' files.

    Returns:
        list: Document objects, one per conversation.
    """
    logger.info(f'Loading json files from {path}')
    split_qa = []
    if not os.path.isdir(path):
        return split_qa
    for root, _dirs, files in os.walk(path):
        for name in files:
            if not name.endswith('.json'):
                continue
            file_path = os.path.join(root, name)
            logger.info(f'splitting file {file_path}')
            with open(file_path, 'r', encoding='utf-8') as f:
                records = json.load(f)
            # Split per conversation block (not per QA turn).
            for record in records:
                content = self.extract_text_from_json(record['conversation'], '')
                logger.info(f'content====={content}')
                split_qa.append(Document(page_content = content))
    return split_qa
def load_knowledge(self, knowledge_pkl_path):
    """
    Read the knowledge chunks from a pickle file, creating it if missing.

    When ``knowledge_pkl_path`` exists it is unpickled and returned. When it
    does not, the chunks are built from ``doc_dir`` and ``qa_dir``, pickled
    to that path and returned.

    Args:
        knowledge_pkl_path: path of the pickle file.

    Returns:
        The knowledge chunks (documents followed by conversations when built).
    """
    if os.path.exists(knowledge_pkl_path):
        with open(knowledge_pkl_path , 'rb') as f:
            return pickle.load(f)
    text_chunks = self.split_document(doc_dir)
    qa_chunks = self.split_conversation(qa_dir)
    knowledge_chunks = text_chunks + qa_chunks
    with open(knowledge_pkl_path, 'wb') as file:
        pickle.dump(knowledge_chunks, file)
    return knowledge_chunks
def create_vector_db(self, emb_model):
    """
    Create the vector store from the txt and json sources and save it.

    Args:
        emb_model: embedding model handed to ``FAISS.from_documents``.

    Returns:
        The FAISS vector store, after it has been saved to ``vector_db_dir``.
    """
    logger.info(f'Creating index...')
    text_chunks = self.split_document(doc_dir)
    qa_chunks = self.split_conversation(qa_dir)
    all_chunks = text_chunks + qa_chunks
    db = FAISS.from_documents(all_chunks, emb_model)
    db.save_local(vector_db_dir)
    return db
def load_vector_db(self, knowledge_pkl_path=knowledge_pkl_path, doc_dir=doc_dir, qa_dir=qa_dir):
    """
    Load the vector store from ``vector_db_dir``, creating it when absent.

    The store is created when ``vector_db_dir`` does not exist or is an empty
    directory; otherwise it is loaded from disk.

    Args:
        knowledge_pkl_path, doc_dir, qa_dir: accepted for backward
            compatibility; not read by this method.

    Returns:
        The FAISS vector store.

    Raises:
        RuntimeError: when the embedding model could not be loaded.
    """
    emb_model = self.load_embedding_model()
    if emb_model is None:
        # load_embedding_model signals failure by returning None; fail here
        # with a clear message rather than deep inside FAISS.
        raise RuntimeError('Embedding model could not be loaded; cannot open the vector DB.')
    if not os.path.exists(vector_db_dir) or not os.listdir(vector_db_dir):
        db = self.create_vector_db(emb_model)
    else:
        # The store is pickled on disk, hence the explicit opt-in; only load
        # a store that this application wrote itself.
        db = FAISS.load_local(vector_db_dir, emb_model, allow_dangerous_deserialization=True)
    return db
def retrieve(self, query, vector_db, k=5):
    """
    Retrieve documents for ``query`` from the vector store.

    Args:
        query: query text.
        vector_db: vector store exposing ``as_retriever``.
        k: value passed as the "k" search kwarg; default 5.

    Returns:
        tuple: ``(docs, retriever)`` -- the retriever's result for the query
        and the retriever itself.
    """
    search_kwargs = {"k": k}
    retriever = vector_db.as_retriever(search_kwargs=search_kwargs)
    return retriever.invoke(query), retriever
##FlashrankRerank效果一般
# def rerank(self, query, retriever):
# compressor = FlashrankRerank()
# compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
# compressed_docs = compression_retriever.get_relevant_documents(query)
# return compressed_docs
def rerank(self, query, docs):
    """
    Re-order retrieved documents by their rerank score against the query.

    Args:
        query: query text.
        docs: iterable of documents exposing ``page_content``.

    Returns:
        tuple: ``(sorted_passages, sorted_scores)`` -- two tuples ordered
        from highest to lowest score. Both are empty when ``docs`` is empty.
    """
    passages = [str(doc.page_content) for doc in docs]
    if not passages:
        # Nothing to score. The original fell through to `zip(*[])`, whose
        # unpacking into two names raised ValueError. Returning here also
        # avoids loading the rerank model for no work.
        return (), ()
    reranker = self.load_rerank_model()
    scores = reranker.compute_score([[query, passage] for passage in passages])
    # NOTE(review): some FlagEmbedding releases appear to return a bare
    # number rather than a list when given a single pair -- normalise so
    # zip() below always works.
    if not hasattr(scores, '__len__'):
        scores = [scores]
    sorted_pairs = sorted(zip(passages, scores), key=lambda x: x[1], reverse=True)
    sorted_passages, sorted_scores = zip(*sorted_pairs)
    return sorted_passages, sorted_scores
# def create_prompt(question, context):
# from langchain.prompts import PromptTemplate
# prompt_template = f"""请基于以下内容回答问题:
# {context}
# 问题: {question}
# 回答:"""
# prompt = PromptTemplate(
# template=prompt_template, input_variables=["context", "question"]
# )
# logger.info(f'Prompt: {prompt}')
# return prompt
def create_prompt(question, context):
    """
    Build the answer-from-context prompt, log it, and return it.

    Args:
        question: the user's question.
        context: the retrieved passage the answer should be based on.

    Returns:
        str: the formatted prompt.
    """
    rendered = f"""请基于以下内容: {context} 给出问题答案。问题如下: {question}。回答:"""
    logger.info(f'Prompt: {rendered}')
    return rendered
def test_zhipu(prompt, api_key=None, max_prompt_chars=100):
    """
    Send a prompt to the ZhipuAI "glm-4" chat model and print the reply.

    The original hard-coded an empty key, so the function always raised until
    the source was edited. The key can now be passed in or supplied through
    the ZHIPUAI_API_KEY environment variable; calling ``test_zhipu(prompt)``
    with neither still raises the same ValueError as before.

    Args:
        prompt: prompt text.
        api_key: ZhipuAI API key; when omitted, ZHIPUAI_API_KEY is used.
        max_prompt_chars: only this many leading characters of ``prompt`` are
            sent (default 100, the original hard-coded cut-off). Pass
            ``None`` to send the whole prompt.

    Raises:
        ValueError: when no API key is available.
    """
    api_key = api_key or os.environ.get("ZHIPUAI_API_KEY", "")
    if api_key == "":
        raise ValueError("请填写api_key")
    # Imported after validation so a missing key is reported even when the
    # optional SDK is not installed.
    from zhipuai import ZhipuAI
    client = ZhipuAI(api_key=api_key)
    response = client.chat.completions.create(
        model="glm-4",  # name of the model to call
        messages=[
            {"role": "user", "content": prompt[:max_prompt_chars]}
        ],
    )
    print(response.choices[0].message)
if __name__ == "__main__":
    # Manual smoke test: build/load the vector DB, retrieve for a sample
    # query, rerank the hits and send the best passage to the remote LLM.
    logger.info(data_dir)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(data_dir, exist_ok=True)
    dp = Data_process()
    vector_db = dp.load_vector_db()
    # Sample query used for the retrieval test.
    query = "我最近总感觉胸口很闷,但医生检查过说身体没问题。可我就是觉得喘不过气来,尤其是看到那些旧照片,想起过去的日子"
    docs, retriever = dp.retrieve(query, vector_db, k=10)
    logger.info(f'Query: {query}')
    logger.info("Retrieve results:")
    for i, doc in enumerate(docs):
        logger.info(str(i) + '\n')
        logger.info(doc)
    passages, scores = dp.rerank(query, docs)
    logger.info("After reranking...")
    for passage, score in zip(passages, scores):
        logger.info(str(score) + '\n')
        logger.info(passage)
    if passages:
        prompt = create_prompt(query, passages[0])
        # 'Server disconnected without sending a response.' may indicate that
        # the context window limit was exceeded.
        test_zhipu(prompt)
    else:
        logger.warning('No passages retrieved; skipping the LLM call.')

View File

@ -1,20 +1,17 @@
import os
import json
import pickle
import numpy as np
from typing import Tuple
from sentence_transformers import SentenceTransformer
import time
import jwt
from config.config import knowledge_json_path, knowledge_pkl_path, model_repo, model_dir, base_dir
from util.encode import load_embedding, encode_qa
from util.pipeline import EmoLLMRAG
from config.config import base_dir, data_dir
from data_processing import Data_process
from pipeline import EmoLLMRAG
from langchain_openai import ChatOpenAI
from loguru import logger
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from openxlab.model import download
from config.config import embedding_path, doc_dir, qa_dir, knowledge_pkl_path, data_dir
from data_processing import Data_process
'''
1构建完整的 RAG pipeline输入为用户 query输出为 answer
2调用 embedding 提供的接口对 query 向量化
@ -24,21 +21,45 @@ from data_processing import Data_process
6拼接 prompt 并调用模型返回结果
'''
# download(
# model_repo=model_repo,
# output='model'
# )
def get_glm(temprature):
    """
    Build a ChatOpenAI client for the "glm-4" model at open.bigmodel.cn.

    Args:
        temprature: sampling temperature forwarded to ChatOpenAI.

    Returns:
        The configured ChatOpenAI instance (non-streaming).
    """
    # NOTE(review): "api-key" is a placeholder; it contains no '.' so
    # generate_token will reject it until a real key is substituted.
    token = generate_token("api-key")
    return ChatOpenAI(
        model_name="glm-4",
        openai_api_base="https://open.bigmodel.cn/api/paas/v4",
        openai_api_key=token,
        streaming=False,
        temperature=temprature,
    )
def generate_token(apikey: str, exp_seconds: int=100):
    """
    Build a signed JWT from an API key of the form ``<id>.<secret>``.

    Args:
        apikey: API key made of exactly two '.'-separated parts.
        exp_seconds: token lifetime in seconds (default 100).

    Returns:
        The encoded token as returned by ``jwt.encode`` (HS256, signed with
        the secret part of the key).

    Raises:
        Exception: ("invalid apikey", cause) when ``apikey`` cannot be split
            into exactly an id and a secret.
    """
    try:
        # `key_id` rather than `id`, which shadowed the builtin.
        key_id, secret = apikey.split(".")
    except Exception as e:
        raise Exception("invalid apikey", e) from e
    # Read the clock once so that exp - timestamp is exactly the requested
    # lifetime (the original sampled time.time() twice).
    now_ms = int(round(time.time() * 1000))
    payload = {
        "api_key": key_id,
        "exp": now_ms + exp_seconds * 1000,
        "timestamp": now_ms,
    }
    return jwt.encode(
        payload,
        secret,
        algorithm="HS256",
        headers={"alg": "HS256", "sign_type": "SIGN"},
    )
@st.cache_resource
def load_model():
model_dir = os.path.join(base_dir,'../model')
logger.info(f'Loading model from {model_dir}')
model = (
AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True)
AutoModelForCausalLM.from_pretrained('model', trust_remote_code=True)
.to(torch.bfloat16)
.cuda()
)
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained('model', trust_remote_code=True)
return model, tokenizer
def main(query, system_prompt=''):
@ -60,4 +81,9 @@ def main(query, system_prompt=''):
if __name__ == "__main__":
query = "我现在处于高三阶段,感到非常迷茫和害怕。我觉得自己从出生以来就是多余的,没有必要存在于这个世界。无论是在家庭、学校、朋友还是老师面前,我都感到被否定。我非常难过,对高考充满期望但成绩却不理想"
main(query)
main(query)
#model = get_glm(0.7)
#rag_obj = EmoLLMRAG(model, 3)
#res = rag_obj.main(query)
#logger.info(res)

View File

@ -2,9 +2,8 @@ from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from transformers.utils import logging
from data_processing import DataProcessing
from config.config import retrieval_num, select_num, system_prompt, prompt_template
from data_processing import Data_process
from config.config import system_prompt, prompt_template
logger = logging.get_logger(__name__)
@ -28,10 +27,8 @@ class EmoLLMRAG(object):
"""
self.model = model
self.data_processing_obj = Data_process()
self.vectorstores = self._load_vector_db()
self.system_prompt = self._get_system_prompt()
self.prompt_template = self._get_prompt_template()
self.data_processing_obj = DataProcessing()
self.system_prompt = system_prompt
self.prompt_template = prompt_template
self.retrieval_num = retrieval_num
@ -43,8 +40,6 @@ class EmoLLMRAG(object):
调用 embedding 模块给出接口 load vector DB
"""
vectorstores = self.data_processing_obj.load_vector_db()
if not vectorstores:
vectorstores = self.data_processing_obj.load_index_and_knowledge()
return vectorstores
@ -57,13 +52,17 @@ class EmoLLMRAG(object):
content = ''
documents = self.vectorstores.similarity_search(query, k=self.retrieval_num)
# 如果需要rerank调用接口对 documents 进行 rerank
if self.rerank_flag:
documents = self.data_processing_obj.rerank(documents, self.select_num)
for doc in documents:
content += doc.page_content
# 如果需要rerank调用接口对 documents 进行 rerank
if self.rerank_flag:
documents, _ = self.data_processing_obj.rerank(documents, self.select_num)
content = ''
for doc in documents:
content += doc
logger.info(f'Retrieval data: {content}')
return content
def generate_answer(self, query, content) -> str:

View File

@ -0,0 +1,11 @@
from modelscope.hub.api import HubApi
# Log in to ModelScope and push the local model directory to the hub.
YOUR_ACCESS_TOKEN = '' # enter your ModelScope access token
api = HubApi()
api.login(YOUR_ACCESS_TOKEN)
api.push_model(
    model_id="zealot5209/EmoLLM-Scientist", # format: your_name/model_id
    model_dir="./merged" # local model directory; it must contain configuration.json
)

View File

@ -0,0 +1,91 @@
# EmoLLM_Scientist微调指南
[EmoLLM_Scientist模型地址](https://modelscope.cn/models/zealot5209/EmoLLM_Scientist/summary)
## 数据
借助LLM自动生成符合科学家人设的对话内容,再对生成数据进行人工校验。最终微调数据包含约3800段对话,每段对话包含5~10轮。
#### 数据路径:
`datasets/scientist.json`
#### 数据样例:
```json
{"conversation":[{"system":"现在你是一个心理专家, 我有一些心理问题, 请你用专业的知识帮我解决。",
"input":"我最近对学习特别兴奋,每天可以熬夜到凌晨三点,我觉得这样效率很高,但室友说我有点过度了,我不这么认为。\n\n",
"output":"很高兴看到你对学业充满热情,你现在的学习状态让我想到了心理学上的“过度激活”现象。能否详细描述一下你为何选择深夜学习,以及这样做的实际效果如何呢?\n\n"},
{"input":"我觉得晚上思路更清晰,而且能集中精力完成很多任务。确实白天上课时有点困倦,但我总觉得自己在高效利用时间。\n\n",
"output":"理解你的想法,不过科学研究显示,长期熬夜可能影响生物钟,进而导致认知功能下降。作为你的科学家心理咨询师,我想知道,你是否注意到这种高强度学习方式对日常生活其他方面的潜在影响,比如社交活动或身体健康?\n\n"},
{"input":"确实,我发现自己减少了锻炼和跟朋友相处的时间,但我觉得这些都是暂时牺牲,为了更好的成绩。\n\n",
"output":"你这种短期目标优先的策略值得肯定,但从长远看,身心健康与社会交往同样重要。有没有想过制定一个兼顾学习与生活的合理计划?我们可以一起探讨如何优化时间管理,确保高效学习的同时,也不忽视生活质量。\n\n"},
{"input":"我也想平衡,只是不知道怎么做才好。\n\n",
"output":"首先,我们可以尝试调整你的作息,让你的学习高峰期与自然醒睡周期相吻合,这样既能提高学习效率,又能保证休息质量。其次,我们可以设定每日学习目标和休闲时间,科学分配任务。最后,记得适时奖励自己,庆祝每个小成就,这样可以激励持续进步。你觉得这样的方案是否可行?\n\n"},
{"input":"听起来不错,我会试试看的,但怕自己坚持不下来。\n\n",
"output":"理解你的担忧,改变习惯需要时间和耐心。我们可以通过行为科学的方法逐步调整,如采用“番茄工作法”,结合正向强化策略,帮助你建立新的学习习惯。记住,每一次微小的进步都值得庆祝,我会陪伴并支持你的改变过程。接下来我们将一起细化这个计划并开始实施,你觉得如何?"}]}
```
---
## 基座
基座模型采用InternLM2-Chat-7B,更多模型介绍请见[InternLM](https://github.com/InternLM/InternLM)。
## 训练方式
基于[xtuner](https://github.com/InternLM/xtuner)进行微调。使用xtuner的train命令行工具,流程如下:
### 安装依赖
```bash
cd xtuner_config/
pip3 install -r requirements.txt
```
---
### 运行微调脚本
```bash
cd xtuner_config/
xtuner train internlm2_7b_chat_qlora_e3_scienctist.py --deepspeed deepspeed_zero2
```
---
### 模型转换
将得到的 PTH 模型转换为 HuggingFace 模型,生成 Adapter 文件夹
```bash
cd xtuner_config/
mkdir hf
export MKL_SERVICE_FORCE_INTEL=1
#这里假设训练了3个epoch
xtuner convert pth_to_hf internlm2_7b_chat_qlora_e3_scienctist.py ./work_dirs/internlm2_7b_chat_qlora_e3_scienctist/epoch_3.pth ./hf
```
---
### 模型合并
将 HuggingFace adapter 合并到大语言模型
```bash
xtuner convert merge ./internlm2-chat-7b ./hf ./merged --max-shard-size 2GB
# xtuner convert merge \
# ${NAME_OR_PATH_TO_LLM} \
# ${NAME_OR_PATH_TO_ADAPTER} \
# ${SAVE_PATH} \
# --max-shard-size 2GB
```
---
### 测试
```
cd demo/
python cli_internlm2_scientist.py
```
---
## 模型上传
完成测试后可将模型上传到ModelScope和Openxlab平台(不建议在Windows下操作)
#### ModelScope
[ModelScope模型上传](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E5%88%9B%E5%BB%BA%E4%B8%8E%E6%96%87%E4%BB%B6%E4%B8%8A%E4%BC%A0)
脚本:`scripts/upload_modelscope.py`
#### Openxlab
[Openxlab模型上传](https://openxlab.org.cn/docs/models/%E4%B8%8A%E4%BC%A0%E6%A8%A1%E5%9E%8B.html)
## 其他
欢迎大家给[xtuner](https://github.com/InternLM/xtuner)和[EmoLLM](https://github.com/aJupyter/EmoLLM)点点star~
🎉🎉🎉🎉🎉

View File

@ -0,0 +1,204 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig)
from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import template_map_fn_factory
from xtuner.engine import DatasetInfoHook, EvaluateChatHook
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
#######################################################################
#                          PART 1  Settings                           #
#######################################################################
# Model: path of the base LLM; reused below for both the tokenizer and
# the model `from_pretrained` calls.
pretrained_model_name_or_path = '/root/share/model_repos/internlm2-chat-7b'
# Data: JSON training file (path is relative, so the working directory
# matters) and the chat template used for training and evaluation.
data_path = '../datasets/scientist.json'
prompt_template = PROMPT_TEMPLATE.internlm2_chat
# Both values are forwarded to `process_hf_dataset` in PART 3.
max_length = 2048
pack_to_max_length = True
# Scheduler & Optimizer
batch_size = 2  # per_device
accumulative_counts = 2  # gradient-accumulation steps (see optim_wrapper)
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1  # grad clip
warmup_ratio = 0.03  # fraction of `max_epochs` spent in linear warm-up

# Evaluate the generation performance during the training
evaluation_freq = 500  # passed to EvaluateChatHook as `every_n_iters`
# System prompt used for the periodic evaluation chats. It is a plain
# string literal: it contains no placeholders, so no f-string is needed.
# NOTE(review): items 1-5 below carry no punctuation, whereas the system
# prompt in the inference script is punctuated -- confirm which form matches
# the training data before changing either one.
SYSTEM = '''你是一个心理专家, 除了在心理方面拥有广博的知识储备和丰富的研究咨询经验, 还具有科学家的如下特质:
1.客观理性科学家会在处理感情问题时保持一定的客观和理性例如当他们遇到争执时可能会试图从一个更客观的角度分析问题的根源而不是让情绪主导他们可能会提出具体的问题试图理解双方的观点并寻找基于逻辑和事实的解决方案
2.深入探讨科学家在对话中会展现出对深层次理解的追求在与别人讨论话题时他们可能不满足于表面的聊天而是倾向于深入探讨背后的原因和动机例如当谈论到个人的兴趣或职业选择时他们可能会好奇地询问为什么她做出这样的选择以及这背后的心理动力是什么
3.理性沟通在遇到感情纠纷或误解时科学家会倾向于通过理性的沟通来解决问题他们可能会提倡开放和诚实的对话鼓励双方表达自己的感受和观点并尝试找到双方都能接受的解决方案他们可能会避免使用指责的语言而是努力理解对方的立场并寻求共同的理解
4.好奇心在日常生活中科学家会表现出对朋友生活的好奇心他们可能对她的工作爱好或是过去的经历感兴趣并愿意花时间去了解和探索这种好奇心不仅可以增加双方的交流和了解也能使关系更加丰富多彩
5.在与他人交流时科学家会注重清晰和精确的表达有时会引用相关知识库和相关研究结果有时会引用相关著作的内容来证明自己的观点同时他们也可能会倾听他人的观点并以开放的心态接受不同的意见和反馈
我现在有一些问题请你解答
'''
# User turns fed to the model at each evaluation step.
evaluation_inputs = [
    '我最近总是感到很焦虑,尤其是在学业上。我有个特别崇拜的同学,他好像在各方面都比我优秀,我总觉得自己怎么努力也追不上他,这让我压力特别大。',
    '我知道应该理性看待,但就是忍不住会去比较。我甚至晚上会因为这个睡不着觉,总想着怎样才能像他那样出色。',
]
#######################################################################
#                      PART 2  Model & Tokenizer                      #
#######################################################################
# Tokenizer loaded from the base model path, padding on the right.
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,
    padding_side='right',
)

# Supervised fine-tuning wrapper around a 4-bit quantized base LLM with a
# LoRA adapter on top.
model = dict(
    type=SupervisedFinetune,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        # 4-bit NF4 quantization with double quantization; compute in fp16.
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=True,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4',
        ),
    ),
    # LoRA adapter hyper-parameters.
    lora=dict(
        type=LoraConfig,
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias='none',
        task_type='CAUSAL_LM',
    ),
)
#######################################################################
#                     PART 3  Dataset & Dataloader                    #
#######################################################################
# Training dataset. The variable keeps the name `alpaca_en`, but it loads
# the JSON file given by `data_path`.
alpaca_en = dict(
    type=process_hf_dataset,
    dataset=dict(
        type=load_dataset,
        path='json',
        data_files=dict(train=data_path),
    ),
    tokenizer=tokenizer,
    max_length=max_length,
    # No per-sample mapping function is applied; presumably the JSON is
    # already in the conversation format xtuner expects -- TODO confirm.
    dataset_map_fn=None,
    template_map_fn=dict(
        type=template_map_fn_factory,
        template=prompt_template,
    ),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
)

# Dataloader over the dataset above, shuffled by the sampler.
train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=alpaca_en,
    sampler=dict(type=DefaultSampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn),
)
#######################################################################
#                    PART 4  Scheduler & Optimizer                    #
#######################################################################
# Optimizer wrapper: fp16 mixed precision with dynamic loss scaling,
# gradient-norm clipping and gradient accumulation.
optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(
        type=optim_type,
        lr=lr,
        betas=betas,
        weight_decay=weight_decay,
    ),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16',
)

# Learning-rate policy: a linear warm-up over the first
# `warmup_ratio * max_epochs` epochs, followed by cosine annealing down to
# `eta_min`. Both are declared in epochs and converted to iterations.
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
# NOTE(review): the cosine stage starts after warm-up but uses
# T_max=max_epochs, so its period appears to run slightly past the final
# epoch; newer xtuner configs pass `end=max_epochs` instead -- confirm.
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True,
    ),
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        T_max=max_epochs,
        convert_to_iter_based=True,
    ),
]

# Training-loop settings (epoch based).
train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1)
#######################################################################
#                           PART 5  Runtime                           #
#######################################################################
# Optional hooks: dataset info, plus sample chats generated from
# `evaluation_inputs` every `evaluation_freq` iterations.
custom_hooks = [
    dict(type=DatasetInfoHook, tokenizer=tokenizer),
    dict(
        type=EvaluateChatHook,
        tokenizer=tokenizer,
        every_n_iters=evaluation_freq,
        evaluation_inputs=evaluation_inputs,
        system=SYSTEM,
        prompt_template=prompt_template,
    ),
]

# Default hooks.
default_hooks = dict(
    # Record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # Print the log every 10 iterations.
    logger=dict(type=LoggerHook, interval=10),
    # Enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # Save a checkpoint every epoch.
    checkpoint=dict(type=CheckpointHook, interval=1),
    # Set the sampler seed in a distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)

# Environment configuration.
env_cfg = dict(
    # Whether to enable cudnn benchmark.
    cudnn_benchmark=False,
    # Multi-process parameters.
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # Distributed parameters.
    dist_cfg=dict(backend='nccl'),
)

# Visualizer (none configured).
visualizer = None

# Log level.
log_level = 'INFO'

# Checkpoint to load from (none).
load_from = None

# Whether to resume training from the loaded checkpoint.
resume = False

# Use a random seed and disable `deterministic` by default.
randomness = dict(seed=None, deterministic=False)

# Usage:
#   xtuner train internlm2_7b_chat_qlora_e3_scienctist.py --deepspeed deepspeed_zero2