[Merge] Dev (#195)

2024-04-15 21:57:16 +08:00 · 2024-04-15 21:57:16 +08:00 · f301087b17
commit f301087b17
parent 27b6d79920 350e3e6008
9 changed files with 1211 additions and 71 deletions
--- a/EmoLLM.ipynb
+++ b/EmoLLM.ipynb
--- a/README.md
+++ b/README.md
@ -253,33 +253,33 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git

 ### 作者（排名不分先后）

-|                            用户名                             |                     学校/组织                      |                                 备注                                 |                     贡献                      |
-| :-----------------------------------------------------------: | :------------------------------------------------: | :------------------------------------------------------------------: | :-------------------------------------------: |
-|            [aJupyter](https://github.com/aJupyter)            |                  南开大学在读硕士                  |                            DataWhale成员                             |                  项目发起人                   |
-|            [MING-ZCH](https://github.com/MING-ZCH)            |               华中科技大学在读本科生               |                       LLM x Psychology 研究者                        |                项目联合负责人                 |
-|          [jujimeizuo](https://github.com/jujimeizuo)          |                  江南大学在读硕士                  |                                                                      |                                               |
-| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) |          哈尔滨工业大学（威海）在读本科生          |                                                                      |                                               |
-|              [8baby8](https://github.com/8baby8)              |                 飞桨领航团区域主管                 |                         文心大模型核心开发者                         |                                               |
-|              [zxazys](https://github.com/zxazys)              |                  南开大学在读硕士                  |                                                                      |                                               |
-|    [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL)    |                       swufe                        |                                                                      |                                               |
-|             [MrCatAI](https://github.com/MrCatAI)             |                      AI搬用工                      |                                                                      |                                               |
-|              [ZeyuBa](https://github.com/ZeyuBa)              |                  自动化所在读硕士                  |                                                                      |                                               |
-|    [aiyinyuedejustin](https://github.com/aiyinyuedejustin)    |               宾夕法尼亚大学在读硕士               |                                                                      |                                               |
-|           [Nobody-ML](https://github.com/Nobody-ML)           |           中国石油大学（华东）在读本科生           |                                                                      |                                               |
-|             [chg0901](https://github.com/chg0901)             | [MiniSora](https://github.com/mini-sora/minisora/) | [MiniSora](https://github.com/mini-sora/minisora/)主要维护者，管理员 | LLM预训练和微调、模型上传、数据清洗、文档翻译 |
-|              [Mxoder](https://github.com/Mxoder)              |             北京航空航天大学在读本科生             |                                                                      |                                               |
-|            [Anooyman](https://github.com/Anooyman)            |                  南京理工大学硕士                  |                                                                      |                                               |
-|          [Vicky-3021](https://github.com/Vicky-3021)          |            西安电子科技大学硕士（研0）             |                                                                      |                                               |
-|         [SantiagoTOP](https://github.com/santiagoTOP)         |                太原理工大学在读硕士                |                                                                      |                                               |
-|         [zealot52099](https://github.com/zealot52099)         |                     个人开发者                     |                                                                      |            清洗数据、LLM微调、RAG             |
-|             [wwwyfff](https://github.com/wwwyfff)             |                  复旦大学在读硕士                  |                                                                      |                                               |
-|            [Yicooong](https://github.com/Yicooong)            |                  南开大学在读硕士                  |                                                                      |                                               |
-|             [jkhumor](https://github.com/jkhumor)             |                  南开大学在读硕士                  |                                                                      |                      RAG                      |
-|        [lll997150986](https://github.com/lll997150986)        |                  南开大学在读硕士                  |                                                                      |                     微调                      |
-|           [nln-maker](https://github.com/nln-maker)           |                  南开大学在读硕士                  |                                                                      |                  前后端开发                   |
-|          [dream00001](https://github.com/dream00001)          |                  南开大学在读硕士                  |                                                                      |                  前后端开发                   |
-|     [王几行XING](https://zhihu.com/people/brycewang1898)      |                  北京大学硕士毕业                  |                                                                      |         清洗数据、LLM微调、前后端开发         |
-|                            [思在]                             |            北京大学硕士毕业（微软美国）            |                                                                      |              LLM微调、前后端开发              |
+|                            用户名                            |                     学校/组织                      |                             备注                             |                     贡献                      |
+| :----------------------------------------------------------: | :------------------------------------------------: | :----------------------------------------------------------: | :-------------------------------------------: |
+|           [aJupyter](https://github.com/aJupyter)            |                  南开大学在读硕士                  |                        DataWhale成员                         |                  项目发起人                   |
+|           [MING-ZCH](https://github.com/MING-ZCH)            |               华中科技大学在读本科生               |                   LLM x Psychology 研究者                    |                项目联合负责人                 |
+|         [jujimeizuo](https://github.com/jujimeizuo)          |                  江南大学在读硕士                  |                                                              |                                               |
+| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) |          哈尔滨工业大学（威海）在读本科生          |                                                              |                                               |
+|             [8baby8](https://github.com/8baby8)              |                 飞桨领航团区域主管                 |                     文心大模型核心开发者                     |                                               |
+|             [zxazys](https://github.com/zxazys)              |                  南开大学在读硕士                  |                                                              |                                               |
+|   [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL)    |                       swufe                        |                                                              |                                               |
+|            [MrCatAI](https://github.com/MrCatAI)             |                      AI搬用工                      |                                                              |                                               |
+|             [ZeyuBa](https://github.com/ZeyuBa)              |                  自动化所在读硕士                  |                                                              |                                               |
+|   [aiyinyuedejustin](https://github.com/aiyinyuedejustin)    |               宾夕法尼亚大学在读硕士               |                                                              |                                               |
+|          [Nobody-ML](https://github.com/Nobody-ML)           |           中国石油大学（华东）在读本科生           |                                                              |                                               |
+|            [chg0901](https://github.com/chg0901)             | [MiniSora](https://github.com/mini-sora/minisora/) | [MiniSora](https://github.com/mini-sora/minisora/)主要维护者，管理员 | LLM预训练和微调、模型上传、数据清洗、文档翻译 |
+|             [Mxoder](https://github.com/Mxoder)              |             北京航空航天大学在读本科生             |                                                              |                                               |
+|           [Anooyman](https://github.com/Anooyman)            |                  南京理工大学硕士                  |                                                              |                                               |
+|         [Vicky-3021](https://github.com/Vicky-3021)          |            西安电子科技大学硕士（研0）             |                                                              |                                               |
+|        [SantiagoTOP](https://github.com/santiagoTOP)         |                太原理工大学在读硕士                |                                                              |      数据清洗，文档管理、Baby EmoLLM维护      |
+|        [zealot52099](https://github.com/zealot52099)         |                     个人开发者                     |                                                              |            清洗数据、LLM微调、RAG             |
+|            [wwwyfff](https://github.com/wwwyfff)             |                  复旦大学在读硕士                  |                                                              |                                               |
+|           [Yicooong](https://github.com/Yicooong)            |                  南开大学在读硕士                  |                                                              |                                               |
+|            [jkhumor](https://github.com/jkhumor)             |                  南开大学在读硕士                  |                                                              |                      RAG                      |
+|       [lll997150986](https://github.com/lll997150986)        |                  南开大学在读硕士                  |                                                              |                     微调                      |
+|          [nln-maker](https://github.com/nln-maker)           |                  南开大学在读硕士                  |                                                              |                  前后端开发                   |
+|         [dream00001](https://github.com/dream00001)          |                  南开大学在读硕士                  |                                                              |                  前后端开发                   |
+|     [王几行XING](https://zhihu.com/people/brycewang1898)     |                  北京大学硕士毕业                  |                                                              |         清洗数据、LLM微调、前后端开发         |
+|                            [思在]                            |            北京大学硕士毕业（微软美国）            |                                                              |              LLM微调、前后端开发              |

 ### 版权说明

--- a/README_EN.md
+++ b/README_EN.md
@ -253,32 +253,32 @@ This project uses Git for version control. You can see the currently available v

 ### Authors (in no particular order)

-|                           Username                            |                         School/Organization                          |                                  Remarks                                  |                                     Contributions                                     |
-| :-----------------------------------------------------------: | :------------------------------------------------------------------: | :-----------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: |
-|            [aJupyter](https://github.com/aJupyter)            |                 Nankai University, Master's student                  |                             DataWhale member                              |                                   Project initiator                                   |
-|            [MING-ZCH](https://github.com/MING-ZCH)            | Huazhong University of Science and Technology, Undergraduate student |                        LLM X Psychology researcher                        |                                   Project co-leader                                   |
-|          [jujimeizuo](https://github.com/jujimeizuo)          |                Jiangnan University, Master's student                 |                                                                           |                                                                                       |
-| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) |    Harbin Institute of Technology (Weihai), Undergraduate student    |                                                                           |                                                                                       |
-|              [8baby8](https://github.com/8baby8)              |              PaddlePaddle Pilot Team Regional Director               |                     Wenxin Large Model core developer                     |                                                                                       |
-|              [zxazys](https://github.com/zxazys)              |                 Nankai University, Master's student                  |                                                                           |                                                                                       |
-|    [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL)    |       SWUFE (Southwestern University of Finance and Economics)       |                                                                           |                                                                                       |
-|             [MrCatAI](https://github.com/MrCatAI)             |                               AI Mover                               |                                                                           |                                                                                       |
-|              [ZeyuBa](https://github.com/ZeyuBa)              |              Institute of Automation, Master's student               |                                                                           |                                                                                       |
-|    [aiyinyuedejustin](https://github.com/aiyinyuedejustin)    |             University of Pennsylvania, Master's student             |                                                                           |                                                                                       |
-|           [Nobody-ML](https://github.com/Nobody-ML)           |  China University of Petroleum (East China), Undergraduate student   |                                                                           |                                                                                       |
-|             [chg0901](https://github.com/chg0901)             |          [MiniSora](https://github.com/mini-sora/minisora)           | Maintainer and Admin of [MiniSora](https://github.com/mini-sora/minisora) | LLM Pre-Training and Fine-Tuning, Model Uploading, Data Cleaning and Docs Translation |
-|              [Mxoder](https://github.com/Mxoder)              |              Beihang University, Undergraduate student               |                                                                           |                                                                                       |
-|            [Anooyman](https://github.com/Anooyman)            |    Nanjing University of Science and Technology, Master's student    |                                                                           |                                                                                       |
-|          [Vicky-3021](https://github.com/Vicky-3021)          |        Xidian University, Master's student (Research Year 0)         |                                                                           |                                                                                       |
-|         [SantiagoTOP](https://github.com/santiagoTOP)         |          Taiyuan University of Technology, Master's student          |                                                                           |                                                                                       |
-|         [zealot52099](https://github.com/zealot52099)         |                         Individual developer                         |                                                                           |                        Data Processing, LLM finetuning and RAG                        |
-|             [wwwyfff](https://github.com/wwwyfff)             |                  FuDan University, Master's student                  |                                                                           |                                                                                       |
-|             [jkhumor](https://github.com/jkhumor)             |                 Nankai University, Master's student                  |                                                                           |                                          RAG                                          |
-|        [lll997150986](https://github.com/lll997150986)        |                 Nankai University, Master's student                  |                                                                           |                                      Fine Tuning                                      |
-|           [nln-maker](https://github.com/nln-maker)           |                 Nankai University, Master's student                  |                                                                           |                          Front-end and back-end development                           |
-|          [dream00001](https://github.com/dream00001)          |                 Nankai University, Master's student                  |                                                                           |                          Front-end and back-end development                           |
-|         [王几行XING](zhihu.com/people/brycewang1898)          |                 Peking University, Master's graduate                 |                                                                           |          Data Processing, LLM finetuning, Front-end and back-end development          |
-|                            [思在]                             |           Peking University, Master's graduate (Microsoft)           |                                                                           |                  LLM finetuning, Front-end and back-end development                   |
+|                           Username                           |                     School/Organization                      |                           Remarks                            |                        Contributions                         |
+| :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+|           [aJupyter](https://github.com/aJupyter)            |             Nankai University, Master's student              |                       DataWhale member                       |                      Project initiator                       |
+|           [MING-ZCH](https://github.com/MING-ZCH)            | Huazhong University of Science and Technology, Undergraduate student |                 LLM X Psychology researcher                  |                      Project co-leader                       |
+|         [jujimeizuo](https://github.com/jujimeizuo)          |            Jiangnan University, Master's student             |                                                              |                                                              |
+| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student |                                                              |                                                              |
+|             [8baby8](https://github.com/8baby8)              |          PaddlePaddle Pilot Team Regional Director           |              Wenxin Large Model core developer               |                                                              |
+|             [zxazys](https://github.com/zxazys)              |             Nankai University, Master's student              |                                                              |                                                              |
+|   [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL)    |   SWUFE (Southwestern University of Finance and Economics)   |                                                              |                                                              |
+|            [MrCatAI](https://github.com/MrCatAI)             |                           AI Mover                           |                                                              |                                                              |
+|             [ZeyuBa](https://github.com/ZeyuBa)              |          Institute of Automation, Master's student           |                                                              |                                                              |
+|   [aiyinyuedejustin](https://github.com/aiyinyuedejustin)    |         University of Pennsylvania, Master's student         |                                                              |                                                              |
+|          [Nobody-ML](https://github.com/Nobody-ML)           | China University of Petroleum (East China), Undergraduate student |                                                              |                                                              |
+|            [chg0901](https://github.com/chg0901)             |      [MiniSora](https://github.com/mini-sora/minisora)       | Maintainer and Admin of [MiniSora](https://github.com/mini-sora/minisora) | LLM Pre-Training and Fine-Tuning, Model Uploading, Data Cleaning and Docs Translation |
+|             [Mxoder](https://github.com/Mxoder)              |          Beihang University, Undergraduate student           |                                                              |                                                              |
+|           [Anooyman](https://github.com/Anooyman)            | Nanjing University of Science and Technology, Master's student |                                                              |                                                              |
+|         [Vicky-3021](https://github.com/Vicky-3021)          |    Xidian University, Master's student (Research Year 0)     |                                                              |                                                              |
+|        [SantiagoTOP](https://github.com/santiagoTOP)         |      Taiyuan University of Technology, Master's student      |                                                              | Data cleansing, document management, Baby EmoLLM maintenance |
+|        [zealot52099](https://github.com/zealot52099)         |                     Individual developer                     |                                                              |           Data Processing, LLM finetuning and RAG            |
+|            [wwwyfff](https://github.com/wwwyfff)             |              Fudan University, Master's student              |                                                              |                                                              |
+|            [jkhumor](https://github.com/jkhumor)             |             Nankai University, Master's student              |                                                              |                             RAG                              |
+|       [lll997150986](https://github.com/lll997150986)        |             Nankai University, Master's student              |                                                              |                         Fine Tuning                          |
+|          [nln-maker](https://github.com/nln-maker)           |             Nankai University, Master's student              |                                                              |              Front-end and back-end development              |
+|         [dream00001](https://github.com/dream00001)          |             Nankai University, Master's student              |                                                              |              Front-end and back-end development              |
+|     [王几行XING](https://zhihu.com/people/brycewang1898)     |             Peking University, Master's graduate             |                                                              | Data Processing, LLM finetuning, Front-end and back-end development |
+|                            [思在]                            |       Peking University, Master's graduate (Microsoft)       |                                                              |      LLM finetuning, Front-end and back-end development      |

 ### Copyright Notice

--- a/datasets/processed/process_single_turn_conversation_construction.py
+++ b/datasets/processed/process_single_turn_conversation_construction.py
@ -1,9 +1,9 @@
 import json

 # 打开JSON文件并读取其内容
-# file_name = 'single_turn_dataset_1.json' 
-file_name = 'single_turn_dataset_2.json'  
-with open(f'/root/StableCascade/emollm2/EmoLLM/datasets/{file_name}', 'rt', encoding='utf-8') as file:
+file_name = 'single_turn_dataset_1.json'
+# file_name = 'single_turn_dataset_2.json'
+with open(f'E:\LLM\EmoLLM\datasets\\{file_name}', 'rt', encoding='utf-8') as file:
    format1_data = json.load(file)

 system = "你是心理健康助手EmoLLM，由EmoLLM团队打造。你旨在通过专业心理咨询，协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术，一步步帮助来访者解决心理问题。"
--- a/rag/README.md
+++ b/rag/README.md
@ -58,6 +58,14 @@ JSON 数据格式如下
 如果已经有 vector DB 则会直接加载对应数据库


+**注意**: 可以直接从 OpenXLab 下载对应的向量数据库（请在 rag 目录下执行以下命令）
+```bash
+# https://openxlab.org.cn/models/detail/Anooyman/EmoLLMRAGTXT/tree/main
+git lfs install
+git clone https://code.openxlab.org.cn/Anooyman/EmoLLMRAGTXT.git
+```
+
+
 ### 配置 config 文件

 根据需要改写 config.config 文件：
--- a/rag/src/config/config.py
+++ b/rag/src/config/config.py
@ -20,6 +20,7 @@ knowledge_json_path = os.path.join(data_dir, 'knowledge.json')      # json
 knowledge_pkl_path = os.path.join(data_dir, 'knowledge.pkl')        # pkl
 doc_dir = os.path.join(data_dir, 'txt')   
 qa_dir = os.path.join(data_dir, 'json')   
+cloud_vector_db_dir = os.path.join(base_dir, 'EmoLLMRAGTXT')

 # log
 log_dir = os.path.join(base_dir, 'log')                             # log
@ -30,13 +31,13 @@ chunk_size=1000
 chunk_overlap=100

 # vector DB
-vector_db_dir = os.path.join(data_dir, 'vector_db')
+vector_db_dir = os.path.join(cloud_vector_db_dir, 'vector_db')

 # RAG related
 # select num: 代表rerank 之后选取多少个 documents 进入 LLM
 # retrieval num： 代表从 vector db 中检索多少 documents。（retrieval num 应该大于等于 select num）
 select_num = 3
-retrieval_num = 10
+retrieval_num = 3

 # LLM key
 glm_key = ''
--- a/rag/src/data_processing.py
+++ b/rag/src/data_processing.py
@ -4,7 +4,7 @@ import os

 from loguru import logger
 from langchain_community.vectorstores import FAISS
-from config.config import (
+from rag.src.config.config import (
    embedding_path,
    embedding_model_name,
    doc_dir, qa_dir,
@ -19,7 +19,6 @@ from config.config import (
 from langchain.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.document_loaders import DirectoryLoader, TextLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain.document_loaders import DirectoryLoader
 from langchain_core.documents.base import Document
 from FlagEmbedding import FlagReranker

@ -199,7 +198,7 @@ class Data_process():
        创建并保存向量库
        '''
        logger.info(f'Creating index...')
-        #split_doc = self.split_document(doc_dir)
+        split_doc = self.split_document(doc_dir)
        split_qa = self.split_conversation(qa_dir)
        # logger.info(f'split_doc == {len(split_doc)}')
        # logger.info(f'split_qa == {len(split_qa)}')
@ -218,7 +217,7 @@ class Data_process():
        if not os.path.exists(vector_db_dir) or not os.listdir(vector_db_dir):
            db = self.create_vector_db(emb_model)
        else:
-            db = FAISS.load_local(vector_db_dir, emb_model, allow_dangerous_deserialization=True)
+            db = FAISS.load_local(vector_db_dir, emb_model)
        return db
    
 if __name__ == "__main__":
--- a/rag/src/pipeline.py
+++ b/rag/src/pipeline.py
@ -2,8 +2,8 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import PromptTemplate
 from transformers.utils import logging

-from data_processing import Data_process
-from config.config import prompt_template 
+from rag.src.data_processing import Data_process
+from rag.src.config.config import prompt_template 
 logger = logging.get_logger(__name__)


@ -48,19 +48,19 @@ class EmoLLMRAG(object):
            output: 检索后并且 rerank 的内容        
        """
    
-        content = ''
+        content = []
        documents = self.vectorstores.similarity_search(query, k=self.retrieval_num)

        for doc in documents:
-            content += doc.page_content
+            content.append(doc.page_content)

        # 如果需要rerank，调用接口对 documents 进行 rerank
        if self.rerank_flag:
            documents, _ = self.data_processing_obj.rerank(documents, self.select_num)

-            content = ''
+            content = []
            for doc in documents:
-                content += doc
+                content.append(doc)
        logger.info(f'Retrieval data: {content}')
        return content
    
--- a/web_internlm2.py
+++ b/web_internlm2.py
@ -12,6 +12,7 @@ import copy
 import os
 import warnings
 from dataclasses import asdict, dataclass
+from rag.src.pipeline import EmoLLMRAG
 from typing import Callable, List, Optional

 import streamlit as st
@ -188,8 +189,9 @@ robot_prompt = "<|im_start|>assistant\n{robot}<|im_end|>\n"
 cur_query_prompt = "<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n"


-def combine_history(prompt):
+def combine_history(prompt, retrieval_content=''):
    messages = st.session_state.messages
+    prompt = f"你需要根据以下从书本中检索到的专业知识:`{retrieval_content}`。从一个心理专家的专业角度来回答后续提问：{prompt}"
    meta_instruction = (
        "你是一个由aJupyter、Farewell、jujimeizuo、Smiling&Weeping研发（排名按字母顺序排序，不分先后）、散步提供技术支持、上海人工智能实验室提供支持开发的心理健康大模型。现在你是一个心理专家，我有一些心理问题，请你用专业的知识帮我解决。"
    )
@ -211,6 +213,7 @@ def main():
    # torch.cuda.empty_cache()
    print("load model begin.")
    model, tokenizer = load_model()
+    rag_obj = EmoLLMRAG(model)
    print("load model end.")

    user_avator = "assets/user.png"
@ -232,9 +235,12 @@ def main():
    # Accept user input
    if prompt := st.chat_input("What is up?"):
        # Display user message in chat message container
+        retrieval_content = rag_obj.get_retrieval_content(prompt)
        with st.chat_message("user", avatar=user_avator):
            st.markdown(prompt)
-        real_prompt = combine_history(prompt)
+            #st.markdown(retrieval_content)
+
+        real_prompt = combine_history(prompt, retrieval_content)
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt, "avatar": user_avator})