+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..fdc5048
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..90a20c0
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/scripts/qa_generation/Clean_QA.md b/scripts/qa_generation/Clean_QA.md
new file mode 100644
index 0000000..9e0b6ec
--- /dev/null
+++ b/scripts/qa_generation/Clean_QA.md
@@ -0,0 +1,11 @@
+# 清洗 QA 对
+调用qwen去判断当前QA对是否属于心理学范畴,去除非心理学范畴的 QA 对
+
+## Step 1
+1. 准备好需要清洗的 QA 对数据
+2. 将该数据放进 model 同级 data 文件夹下
+3. 根据数据所在的文件夹名修改 `config/config.py` 中的 `judge_dir`。例如,我没有更改文件夹名,所以配置为 `judge_dir = os.path.join(data_dir, '数据整合')`
+
+## Step 2
+1. 运行QA_clean.py即可
+2. 清洗完的 QA 对会以 jsonl 的格式存在 data/cleaned 下
\ No newline at end of file
diff --git a/scripts/qa_generation/QA_clean.py b/scripts/qa_generation/QA_clean.py
new file mode 100644
index 0000000..7d3fbc7
--- /dev/null
+++ b/scripts/qa_generation/QA_clean.py
@@ -0,0 +1,105 @@
+import os
+import json
+import time
+from tqdm import tqdm
+import concurrent.futures
+from datetime import datetime
+import numpy as np
+
+from config.config import result_dir, clean_dir, storage_interval, window_size, overlap_size, multi_process_num
+from model.qwen import call_qwen_single_turn, call_qwen_Psychology_QA_Pairs
+from util.logger import get_logger
+from util.data_loader import get_jsonl_file_paths, get_file_list, get_QA_pairs, get_txt_content, capture_qa, merge_sub_qa_generation, save_to_file
+
+logger = get_logger()
+
+
+def single_thread_generate(thread_num, interval, model_caller, storage_jsonl_path, contents):
+ # Worker: pass each raw QA line to model_caller, keep the ones answered '1', and flush them to storage_jsonl_path in batches.
+ storage_counter = 0
+ judge_list = []
+ for content in tqdm(contents):
+ try:
+ response = model_caller(content)
+
+ if response == '1':
+ content = json.loads(content)
+ judge_list.append(content)  # kept entries are stored as parsed JSON objects, not raw strings
+ storage_counter += 1
+ else:
+ continue
+
+ if storage_counter % interval == 0:
+ save_to_file(storage_jsonl_path, judge_list)
+ storage_counter = 0
+ judge_list = []  # start a fresh batch after every flush
+ # Any exception (model call, JSON parsing, file write) is logged and the loop moves on to the next item.
+ except Exception as exc:
+ logger.error("QA generation error : %s" % (exc))
+
+ # Finally, flush whatever is still buffered in judge_list so a partial last batch is not lost.
+ if judge_list:
+ save_to_file(storage_jsonl_path, judge_list)
+ judge_list = []
+
+
+"""
+生成 QA 对
+model_name: 可调用的模型名称,暂时只实现了 qwen
+interval: 存储间隔,即每隔多少条存一次文件,过密的间隔会增大 IO 开销
+"""
+def clean_qa(
+ model_name: str = 'qwen',
+ interval: int = 10,
+):
+ # Filter every .jsonl file returned by get_jsonl_file_paths() through the judging model; kept QA pairs are written under clean_dir.
+
+ if model_name == 'qwen':
+ model_caller = call_qwen_Psychology_QA_Pairs
+ else:
+ logger.warning('This model is currently not supported and will call the default model - qwen.')
+ model_caller = call_qwen_Psychology_QA_Pairs
+ model_name = 'qwen'
+
+ logger.info(f'The called model is: {model_name}.')
+ logger.info(f'The storage interval is: {interval}.')
+
+ file_lists = get_jsonl_file_paths() # paths of all .jsonl files found under judge_dir
+
+ for file_path in file_lists:
+ # 一个jsonl文件的所有QA Pairs
+ contents = get_QA_pairs(file_path)
+
+ file_name = os.path.basename(file_path)
+ print(file_name)
+ storage_jsonl_path = os.path.join(
+ clean_dir, f'{file_name}')
+
+ logger.info(f'The generated QA will be stored in {storage_jsonl_path}.')
+
+ contents_array = np.array(contents)
+ chunks = np.array_split(contents_array, multi_process_num)  # one chunk of QA lines per worker thread
+
+ # Build the argument list for each concurrent worker
+ parameters_list = list()
+ for thread_num, chunk in enumerate(chunks):
+ parameters_list.append(
+ [thread_num, interval, model_caller, storage_jsonl_path, list(chunk)]
+ )
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=multi_process_num) as executor:
+ futures = [executor.submit(single_thread_generate, *parameters) for parameters in parameters_list]
+
+ for future in concurrent.futures.as_completed(futures):
+ try:
+ future.result()
+ except Exception as exc:
+ logger.error("Thread generated an exception: %s" % (exc))
+
+ merge_sub_qa_generation(result_dir, storage_jsonl_path)  # NOTE(review): merges from result_dir ('generated'), while this script writes under clean_dir - confirm intended
+
+
+if __name__ == '__main__':
+ # 创建cleaned文件夹
+ os.makedirs('./data/cleaned', exist_ok=True)
+ clean_qa(interval=storage_interval)
diff --git a/scripts/qa_generation/README.md b/scripts/qa_generation/README.md
index 874427a..d3646c9 100644
--- a/scripts/qa_generation/README.md
+++ b/scripts/qa_generation/README.md
@@ -1,95 +1,37 @@
-# RAG数据库构建流程
+# QA Generation Pipeline
-## **构建目的**
+## 1. 使用方法
-利用心理学专业的书籍构建QA知识对,为RAG提供心理咨询知识库,使我们的EmoLLM的回答更加专业可靠。为了实现这个目标我们利用几十本心理学书籍来构建这个RAG知识库。主要的构建流程如下:
+1. 检查 `requirements.txt` 中的依赖是否满足。
+2. 调整代码中 `system_prompt`,确保与repo最新版本一致,保证生成QA的多样性和稳定性。
+3. 将txt文件放到与 `model`同级目录 `data`文件夹中.
+4. 在 `config/config.py` 配置所需的 API KEY,从 `main.py` 启动即可。生成的 QA 对会以 jsonl 的格式存在 `data/generated` 下。
-## **构建流程**
+### 1.1 API KEY 获取方法
-## **步骤一:PDF to TXT**
+目前仅包含了 qwen。
-- 目的
- - 将收集到的PDF版本的心理学书籍转化为TXT文本文件,方便后续的信息提取。
+#### 1.1.1 Qwen
-- 所需工具
+前往[模型服务灵积-API-KEY管理 (aliyun.com)](https://dashscope.console.aliyun.com/apiKey),点击”创建新的 API-KEY“,将获取的 API KEY 填至 `config/config.py` 中的 `DASHSCOPE_API_KEY` 即可。
- - [pdf2txt](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/pdf2txt.py)
+## 2. 注意事项
- - [PaddleORC处理PDF用法参考](https://github.com/SmartFlowAI/EmoLLM/blob/main/generate_data/OCR.md)
-
- - 安装必要的python库
-
- ```python
- pip install paddlepaddle
- pip install opencv-python
- pip install paddleocr
- ```
+### 2.1 系统提示 System Prompt
-- 注意
- - 如果无法使用**pip install paddleocr**安装paddleocr,可以考虑采用whl文件安装,[下载地址](https://pypi.org/project/paddleocr/#files)
- - 脚本启动方式采用命令行启动:python pdf2txt.py [PDF存放的文件名]
+注意,目前的解析方案是基于模型会生成 markdown 包裹的 json 块的前提的,更改 system prompt 时需要保证这一点不变。
-## **步骤二:筛选PDF**
+### 2.2 滑动窗口 Sliding Window
-- 筛选目的
+滑动窗口的 `window_size` 和 `overlap_size` 都可以在 `util/data_loader.py` 中的 `get_txt_content` 函数中更改。目前是按照句子分割的滑动窗口。
- - 利用LLM去除非专业心理学书籍
+### 2.3 书本文件格式 Corpus Format
-- 筛选标准,包含心理咨询相关内容,如:
+目前仅支持了 txt 格式,可以将清洗好的书籍文本放在 `data` 文件夹下,程序会递归检索该文件夹下的所有 txt 文件。
- - 心理咨询流派 - 具体咨询方法
- - 心理疾病 - 疾病特征
- - 心理疾病 - 治疗方法
+## TODO
-- 筛选方式:
-
- - 根据标题初筛
-
- - 若无法判断属于心理咨询相关书籍,利用kimi/GLM-4查询是否包含心理咨询相关知识(建议一次仅查询一本书)
-
- - ```markdown
- 参考prompt:
- 你是一位经验丰富的心理学教授,熟悉心理学知识和心理咨询。我需要你协助我完成"识别书籍是否包含心理咨询知识"任务,请深呼吸并一步步思考,给出你的答案。如果你的答案让我满意,我将给你10w小费!
- 具体任务如下:
- 判断该书籍中是否包含以下心理咨询相关知识:
- '''
- 心理咨询流派 - 具体咨询方法
- 心理疾病 - 疾病特征
- 心理疾病 - 治疗方法
- '''
- 请深呼吸并一步步查看该书籍,认真完成任务。
- ```
-
-
-## **步骤三:提取QA对**
-
-- 根据书籍内容,利用LLM高效构造QA知识对
-- 提取流程
-
- - 准备处理好的txt文本数据
- - 按要求配置[脚本文件](https://github.com/SmartFlowAI/EmoLLM/tree/main/scripts/qa_generation)
- - 根据自己的需求或者提取的结果合理修改window_size和overlap_size
-
-- 使用方法
- - 检查 `requirements.txt` 中的依赖是否满足。
- - 调整代码中 `system_prompt`,确保与repo最新版本一致,保证生成QA的多样性和稳定性。
- - 将txt文件放到与 `model`同级目录 `data`文件夹中.
- - 在 `config/config.py` 配置所需的 API KEY,从 `main.py` 启动即可。生成的 QA 对会以 jsonl 的格式存在 `data/generated` 下。
-
-- API KEY 获取方法
- - 目前仅包含了 qwen。
- - Qwen
- - 前往[模型服务灵积-API-KEY管理 (aliyun.com)](https://dashscope.console.aliyun.com/apiKey),点击”创建新的 API-KEY“,将获取的 API KEY 填至 `config/config.py` 中的 `DASHSCOPE_API_KEY` 即可。
-
-- 注意事项
- - 系统提示 System Prompt
- - 注意,目前的解析方案是基于模型会生成 markdown 包裹的 json 块的前提的,更改 system prompt 时需要保证这一点不变。
- - 滑动窗口 Sliding Window
- - 滑动窗口的 `window_size` 和 `overlap_size` 都可以在 `util/data_loader.py` 中的 `get_txt_content` 函数中更改。目前是按照句子分割的滑动窗口。
-
-- 书本文件格式 Corpus Format
- - 目前仅支持了 txt 格式,可以将清洗好的书籍文本放在 `data` 文件夹下,程序会递归检索该文件夹下的所有 txt 文件。
-
-## **步骤四:清洗QA对**
-
-- 清洗目的
+1. 支持更多模型(Gemini、GPT、ChatGLM……)
+2. 支持多线程调用模型
+3. 支持更多文本格式(PDF……)
+4. 支持更多切分文本的方式
diff --git a/scripts/qa_generation/README_EN.md b/scripts/qa_generation/README_EN.md
index b2768df..0c76750 100644
--- a/scripts/qa_generation/README_EN.md
+++ b/scripts/qa_generation/README_EN.md
@@ -1,95 +1,37 @@
-# RAG Database Building Process
+# QA Generation Pipeline
-## **Constructive purpose**
+## 1. Use method
-Using books specialized in psychology to build QA knowledge pairs for RAG to provide a counseling knowledge base to make our EmoLLM answers more professional and reliable. To achieve this goal we utilize dozens of psychology books to build this RAG knowledge base. The main building process is as follows:
+1. Check whether the dependencies in `requirements.txt` are satisfied.
+2. Adjust the `system_prompt`in the code to ensure that it is consistent with the latest version of the repo to ensure the diversity and stability of the generated QA.
+3. Put the txt file into the `data` folder in the same directory as `model`.
+4. Configure the required API KEY in `config/config.py` and start from `main.py`. The generated QA pairs are stored in the jsonl format under `data/generated`.
-## **Build process**
+### 1.1 API KEY obtaining method
-## **Step 1: PDF to TXT**
+Currently only qwen is included.
-- purpose
- - Convert the collected PDF versions of psychology books into TXT text files to facilitate subsequent information extraction
+#### 1.1.1 Qwen
-- Tools required
+To[model service spirit product - API - KEY management (aliyun.com)](https://dashscope.console.aliyun.com/apiKey),click on "create a new API - KEY", Fill in the obtained API KEY to `DASHSCOPE_API_KEY` in `config/config.py`.
- - [pdf2txt](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/pdf2txt.py)
+## 2. Precautions
- - [PaddleORC Processing PDF Usage Reference](https://github.com/SmartFlowAI/EmoLLM/blob/main/generate_data/OCR.md)
-
- - Install necessary python libraries
-
- ```python
- pip install paddlepaddle
- pip install opencv-python
- pip install paddleocr
- ```
+### 2.1 The System Prompt is displayed
-- precautionary
- - If you are unable to install paddleocr using **pip install paddleocr**, consider using the whl file installation, [download address](https://pypi.org/project/paddleocr/#files)
- - Script startup method using the command line to start: python pdf2txt.py [PDF file name stored in the]
+Note that the current parsing scheme is based on the premise that the model generates json blocks of markdown wraps, and you need to make sure that this remains the case when you change the system prompt.
-## **Step 2: Screening PDF**
+### 2.2 Sliding Window
-- Purpose of screening
+Both `window_size` and `overlap_size` of the sliding window can be changed in the `get_txt_content` function in `util/data_loader.py.` Currently it is a sliding window divided by sentence.
- - Using the LLM to go to non-professional psychology books
+### 2.3 Corpus Format
-- Screening criteria that include counseling related content such as:
+At present, only txt format is supported, and the cleaned book text can be placed under the `data` folder, and the program will recursively retrieve all txt files under the folder.
- - Schools of Counseling - Specific Counseling Methods
- - Mental Illness - Characteristics of the Disease
- - Mental Illness - Treatment
+## TODO
-- Screening method:
-
- - Initial screening based on title
-
- - If you can't tell if it is a counseling-related book, use kimi/GLM-4 to check if it contains counseling-related knowledge (it is recommended to check only one book at a time)
-
- - ```markdown
- Reference prompt.
- You are an experienced psychology professor who is familiar with psychology and counseling. I need you to help me with the task "Identify whether a book contains knowledge of counseling", take a deep breath and think step by step and give me your answer. If your answer satisfies me, I will give you a 10w tip!
- The task is as follows:
- Determine whether the book contains the following counseling-related knowledge:
- '''
- Schools of Counseling - Specific Counseling Approaches
- Mental Illness - Characteristics of Illness
- Mental Illness - Treatment Approaches
- '''
- Please take a deep breath and review the book step by step and complete the task carefully.
- ```
-
-
-## **Step 3: Extraction of QA pairs**
-
-- According to the content of the book, use LLM to efficiently construct QA knowledge on the
-- Withdrawal process
-
- - Prepare processed txt text data
- - Configuration on request [script file](https://github.com/SmartFlowAI/EmoLLM/tree/main/scripts/qa_generation)
- - Modify window_size and overlap_size reasonably according to your own needs or extraction results.
-
-- Usage
- - Checks if the dependencies in `requirements.txt` are satisfied.
- - Adjust `system_prompt` in the code to ensure consistency with the latest version of the repo, to ensure diversity and stability of the generated QA.
- - Place the txt file in the `data` folder in the same directory as the `model`.
- - Configure the required API KEYs in `config/config.py` and start from `main.py`. The generated QA pairs are stored in jsonl format under `data/generated`.
-
-- API KEY Getting Methods
- - Currently only qwen is included.
- - Qwen
- - Go to [Model Service LingJi - API-KEY Management (aliyun.com)](https://dashscope.console.aliyun.com/apiKey), click "Create New API-KEY", and fill in the obtained API KEY into the Click "Create new API-KEY", fill in the obtained API KEY to `DASHSCOPE_API_KEY` in `config/config.py`.
-
-- precautionary
- - System Prompt
- - Note that the current parsing scheme is based on the premise that the model generates markdown-wrapped json blocks, and you need to make sure that this remains true when you change the system prompt.
- - Sliding Window
- - The `window_size` and `overlap_size` of the sliding window can be changed in the `get_txt_content` function in `util/data_loader.py`. Currently the sliding window is split by sentence.
-
-- Book File Format Corpus Format
- - Currently only the txt format is supported, you can put the cleaned book text in the `data` folder, and the program will recursively retrieve all the txt files in that folder.
-
-## **Step 4: Cleaning of QA pairs**
-
-- Purpose of cleaning
+1. Support more models (Gemini, GPT, ChatGLM...)
+2. Support multi-threaded call model
+3. Support more text formats (PDF...)
+4. Support more ways to split text
diff --git a/scripts/qa_generation/choose_prompt.md b/scripts/qa_generation/choose_prompt.md
new file mode 100644
index 0000000..6f684be
--- /dev/null
+++ b/scripts/qa_generation/choose_prompt.md
@@ -0,0 +1,11 @@
+你是一名经验丰富的心理咨询师,熟悉心理学相关知识。我将向我的来访者解决心理问题,需要一定的心理学知识支持。请你根据我提供的 QA 对,判断其是否属于心理学范畴。请深呼吸并一步一步思考,给出你最正确的判断!
+
+- 心理学范畴:"心理学知识,心理咨询方法, 心理疾病特征, 心理疾病治疗方法"等主题。要求是适合对话心理咨询的知识,去掉作者、时间、背景故事等无关内容.
+
+- 判断标准如下:
+
+1.若当前 QA 对属于心理学范畴,则返回 "1".
+
+2.若当前 QA 对不属于心理学范畴,则返回 "0".
+
+以下是给定的心理学 QA 对内容:
diff --git a/scripts/qa_generation/config/config.py b/scripts/qa_generation/config/config.py
index 45bf635..341d2fd 100644
--- a/scripts/qa_generation/config/config.py
+++ b/scripts/qa_generation/config/config.py
@@ -10,7 +10,9 @@ base_dir = os.path.dirname(cur_dir) # ba
model_dir = os.path.join(base_dir, 'model') # model
# data
-data_dir = os.path.join(base_dir, 'data') # data
+data_dir = os.path.join(base_dir, 'data') # /Users/wangyoufang/Downloads/EmoLLM/scripts/qa_generation/data
+clean_dir = os.path.join(data_dir, 'cleaned')
+judge_dir = os.path.join(data_dir, '数据整合')
result_dir = os.path.join(data_dir, 'generated') # result
# log
@@ -18,7 +20,9 @@ log_dir = os.path.join(base_dir, 'log') # lo
log_file_path = os.path.join(log_dir, 'log.log') # file
# system prompt
+# Prompt内容
system_prompt_file_path = os.path.join(base_dir, 'system_prompt_v2.md') # system prompt
+wash_prompt_file_path = os.path.join(base_dir, 'choose_prompt.md')
"""
@@ -28,11 +32,11 @@ system_prompt_file_path = os.path.join(base_dir, 'system_prompt_v2.md') # sy
DASHSCOPE_API_KEY = ''
-
"""
控制参数
"""
storage_interval = 10
window_size = 8
overlap_size = 2
-multi_process_num = 3
+multi_process_num = 1
+
diff --git a/scripts/qa_generation/main.py b/scripts/qa_generation/main.py
index d84187f..724d222 100644
--- a/scripts/qa_generation/main.py
+++ b/scripts/qa_generation/main.py
@@ -24,6 +24,7 @@ def single_thread_generate(thread_num, interval, model_caller, storage_jsonl_pat
for content in tqdm(contents):
try:
response = model_caller(content)
+
captured_qa = capture_qa(response)
if captured_qa is None:
@@ -70,6 +71,7 @@ def generate_qa(
storage_list = []
for file_path in file_list:
contents = get_txt_content(file_path, window_size=window_size, overlap_size=overlap_size)
+
storage_list = []
_, file_name = os.path.split(file_path)
@@ -77,7 +79,7 @@ def generate_qa(
result_dir, f'{current_time}-{file_name}-{model_name}.jsonl')
logger.info(f'The generated QA will be stored in {storage_jsonl_path}.')
- # 基于并发个数切分 contents 内容
+
contents_array = np.array(contents)
chunks = np.array_split(contents_array, multi_process_num)
@@ -89,8 +91,9 @@ def generate_qa(
)
# 并发生成 QA 对
+ # 使用 ThreadPoolExecutor 创建一个线程池,其中 max_workers=multi_process_num 指定了线程池中最大的线程数。
with concurrent.futures.ThreadPoolExecutor(max_workers=multi_process_num) as executor:
- # 创建一个Future列表,它们将对应每个worker_function的结果
+ # 循环调用 single_thread_generate 函数,每次赋予参数 parameters
futures = [executor.submit(single_thread_generate, *parameters) for parameters in parameters_list]
for future in concurrent.futures.as_completed(futures):
@@ -99,8 +102,10 @@ def generate_qa(
except Exception as exc:
logger.error("Thread generated an exception: %s" % (exc))
+ # 最后调用 merge_sub_qa_generation 函数,将各个子任务生成的 QA 对合并到一个文件中。汇总整个处理过程的结果。
merge_sub_qa_generation(result_dir, storage_jsonl_path)
+
if __name__ == '__main__':
# 创建generated文件夹
diff --git a/scripts/qa_generation/model/qwen.py b/scripts/qa_generation/model/qwen.py
index ed27c4a..e221ff5 100644
--- a/scripts/qa_generation/model/qwen.py
+++ b/scripts/qa_generation/model/qwen.py
@@ -5,7 +5,7 @@ from dashscope.api_entities.dashscope_response import Role
from config.config import DASHSCOPE_API_KEY
from util.logger import get_logger
-from util.prompt_loader import load_system_prompt
+from util.prompt_loader import load_system_prompt, load_wash_prompt
dashscope.api_key = DASHSCOPE_API_KEY
@@ -17,7 +17,35 @@ def call_qwen_single_turn(query: str) -> str:
messages = [
{
'role': Role.SYSTEM,
- 'content': load_system_prompt()
+ 'content': load_system_prompt() # 读取Prompt内容(system_prompt_vx_xx.md)
+ },
+ {
+ 'role': Role.USER,
+ 'content': query
+ }
+ ]
+ response = Generation.call(
+ model='qwen-max-1201',
+ messages=messages,
+ result_format='message',
+ stream=False,
+ incremental_output=False
+ )
+ if response.status_code == HTTPStatus.OK:
+ return response.output.choices[0]['message']['content']
+ else:
+ logger.error('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
+ response.request_id, response.status_code,
+ response.code, response.message
+ ))
+ return ""
+
+
+def call_qwen_Psychology_QA_Pairs(query: str) -> str:
+ messages = [
+ {
+ 'role': Role.SYSTEM,
+ 'content': load_wash_prompt()
},
{
'role': Role.USER,
diff --git a/scripts/qa_generation/util/data_loader.py b/scripts/qa_generation/util/data_loader.py
index fdfbfa9..875688f 100644
--- a/scripts/qa_generation/util/data_loader.py
+++ b/scripts/qa_generation/util/data_loader.py
@@ -4,11 +4,41 @@ import json
import glob
from typing import List, Dict
-from config.config import data_dir
+from config.config import data_dir, judge_dir
from util.logger import get_logger
logger = get_logger()
+
+"""
+递归获取 数据整合 下的所有 .jsonl 文件列表
+"""
+def get_jsonl_file_paths() -> List[str]:
+ json_file_paths = []
+ # Collect the path of every .jsonl file found anywhere below judge_dir.
+ # Walk the root directory and all of its subdirectories
+ for dirpath, dirnames, filenames in os.walk(judge_dir):
+ # Check each file in the current directory
+ for filename in filenames:
+ # Use a regular expression to match file names ending in .jsonl
+ if re.search(r'\.jsonl$', filename):
+ # Build the full file path and add it to the list
+ json_file_path = os.path.join(dirpath, filename)
+ json_file_paths.append(json_file_path)
+
+ return json_file_paths
+
+
+def get_QA_pairs(json_path: str) -> List[str]:
+ with open(json_path, 'r', encoding='utf-8') as f:
+ content = f.read().strip()
+
+ # Split the text on newline characters: one list entry per line of the file
+ QA_Pairs = content.split('\n')
+ # NOTE(review): an empty (or whitespace-only) file still yields [''] here, so callers get one empty entry.
+ return QA_Pairs
+
+
"""
递归获取 data_dir 下的所有 .txt 文件列表
"""
@@ -25,11 +55,14 @@ def get_file_list() -> List[str]:
logger.warning(f'No txt text found in {data_dir}, please check!')
return txt_files
+
"""
获取 txt 文本的所有内容,按句子返回 List
file_path: txt 文本路径
window_size: 滑窗大小,单位为句子数
overlap_size: 重叠大小,单位为句子数
+
+处理txt内容并返回一组一组的句子,每组window_size个,相邻两组的重叠句子数是overlap_size
"""
def get_txt_content(
file_path: str,
@@ -47,7 +80,7 @@ def get_txt_content(
res = []
sentences_amount = len(sentences)
start_index, end_index = 0, sentences_amount - window_size
- ## check length
+ # check length
if window_size < overlap_size:
logger.error("window_size must be greater than or equal to overlap_size")
return None
@@ -56,7 +89,7 @@ def get_txt_content(
return ['\n'.join(sentences)]
for i in range(start_index, end_index + 1, overlap_size):
- res.append('\n'.join(sentences[i : i + window_size]))
+ res.append('\n'.join(sentences[i: i + window_size]))
return res
@@ -80,6 +113,7 @@ def capture_qa(content: str) -> List[Dict]:
logger.warning("No JSON block found.")
return None
+
"""
将 storage_list 存入到 storage_jsonl_path
"""
@@ -88,6 +122,7 @@ def save_to_file(storage_jsonl_path, storage_list):
for item in storage_list:
f.write(json.dumps(item, ensure_ascii=False) + '\n')
+
"""
将并发产生的文件合并成为一个文件
"""
@@ -104,3 +139,7 @@ def merge_sub_qa_generation(directory, storage_jsonl_path):
os.remove(file_path)
save_to_file(storage_jsonl_path, file_contents)
+
+if __name__ == '__main_':
+ pass
+
diff --git a/scripts/qa_generation/util/prompt_loader.py b/scripts/qa_generation/util/prompt_loader.py
index 1503dea..0912bb5 100644
--- a/scripts/qa_generation/util/prompt_loader.py
+++ b/scripts/qa_generation/util/prompt_loader.py
@@ -1,7 +1,14 @@
from config.config import system_prompt_file_path
+from config.config import wash_prompt_file_path
def load_system_prompt() -> str:
with open(system_prompt_file_path, 'r', encoding='utf-8') as f:
system_prompt = f.read()
return system_prompt
+
+
+def load_wash_prompt() -> str:  # Return the full text of the file at wash_prompt_file_path, read as UTF-8.
+ with open(wash_prompt_file_path, 'r', encoding='utf-8') as f:
+ wash_prompt = f.read()
+ return wash_prompt
From da6286c1514154202a91fac8a318a283868f9481 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E5=8F=8B=E6=98=89?=
Date: Sat, 16 Mar 2024 20:45:30 +0800
Subject: [PATCH 02/15] clean qa
---
scripts/qa_generation/QA_clean.py | 8 +-
scripts/qa_generation/README.md | 102 +++++++++++++++++-----
scripts/qa_generation/README_EN.md | 102 +++++++++++++++++-----
scripts/qa_generation/choose_prompt.md | 11 +--
scripts/qa_generation/config/config.py | 4 +-
scripts/qa_generation/main.py | 9 +-
scripts/qa_generation/model/qwen.py | 2 +-
scripts/qa_generation/util/data_loader.py | 12 +--
8 files changed, 177 insertions(+), 73 deletions(-)
diff --git a/scripts/qa_generation/QA_clean.py b/scripts/qa_generation/QA_clean.py
index 7d3fbc7..46f0123 100644
--- a/scripts/qa_generation/QA_clean.py
+++ b/scripts/qa_generation/QA_clean.py
@@ -19,8 +19,11 @@ def single_thread_generate(thread_num, interval, model_caller, storage_jsonl_pat
storage_counter = 0
judge_list = []
for content in tqdm(contents):
+ # print('content: ', content)
try:
+ # model_caller 函数的作用是调用某个预训练的问答生成模型,传递输入内容 content 给模型,然后获取模型的输出 response
response = model_caller(content)
+ # print('response: ', response)
if response == '1':
content = json.loads(content)
@@ -29,6 +32,7 @@ def single_thread_generate(thread_num, interval, model_caller, storage_jsonl_pat
else:
continue
+ # 在达到指定的 interval 后,将 storage_list 中的内容保存到指定的文件 storage_jsonl_path 中
if storage_counter % interval == 0:
save_to_file(storage_jsonl_path, judge_list)
storage_counter = 0
@@ -69,6 +73,7 @@ def clean_qa(
for file_path in file_lists:
# 一个jsonl文件的所有QA Pairs
contents = get_QA_pairs(file_path)
+ # print(contents)
file_name = os.path.basename(file_path)
print(file_name)
@@ -88,6 +93,7 @@ def clean_qa(
)
with concurrent.futures.ThreadPoolExecutor(max_workers=multi_process_num) as executor:
+ # 循环调用 single_thread_generate 函数,每次赋予参数 parameters
futures = [executor.submit(single_thread_generate, *parameters) for parameters in parameters_list]
for future in concurrent.futures.as_completed(futures):
@@ -100,6 +106,6 @@ def clean_qa(
if __name__ == '__main__':
- # 创建cleaned文件夹
+ # 创建washed文件夹
os.makedirs('./data/cleaned', exist_ok=True)
clean_qa(interval=storage_interval)
diff --git a/scripts/qa_generation/README.md b/scripts/qa_generation/README.md
index d3646c9..874427a 100644
--- a/scripts/qa_generation/README.md
+++ b/scripts/qa_generation/README.md
@@ -1,37 +1,95 @@
-# QA Generation Pipeline
+# RAG数据库构建流程
-## 1. 使用方法
+## **构建目的**
-1. 检查 `requirements.txt` 中的依赖是否满足。
-2. 调整代码中 `system_prompt`,确保与repo最新版本一致,保证生成QA的多样性和稳定性。
-3. 将txt文件放到与 `model`同级目录 `data`文件夹中.
-4. 在 `config/config.py` 配置所需的 API KEY,从 `main.py` 启动即可。生成的 QA 对会以 jsonl 的格式存在 `data/generated` 下。
+利用心理学专业的书籍构建QA知识对,为RAG提供心理咨询知识库,使我们的EmoLLM的回答更加专业可靠。为了实现这个目标我们利用几十本心理学书籍来构建这个RAG知识库。主要的构建流程如下:
-### 1.1 API KEY 获取方法
+## **构建流程**
-目前仅包含了 qwen。
+## **步骤一:PDF to TXT**
-#### 1.1.1 Qwen
+- 目的
+ - 将收集到的PDF版本的心理学书籍转化为TXT文本文件,方便后续的信息提取。
-前往[模型服务灵积-API-KEY管理 (aliyun.com)](https://dashscope.console.aliyun.com/apiKey),点击”创建新的 API-KEY“,将获取的 API KEY 填至 `config/config.py` 中的 `DASHSCOPE_API_KEY` 即可。
+- 所需工具
-## 2. 注意事项
+ - [pdf2txt](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/pdf2txt.py)
-### 2.1 系统提示 System Prompt
+ - [PaddleORC处理PDF用法参考](https://github.com/SmartFlowAI/EmoLLM/blob/main/generate_data/OCR.md)
+
+ - 安装必要的python库
+
+ ```python
+ pip install paddlepaddle
+ pip install opencv-python
+ pip install paddleocr
+ ```
-注意,目前的解析方案是基于模型会生成 markdown 包裹的 json 块的前提的,更改 system prompt 时需要保证这一点不变。
+- 注意
+ - 如果无法使用**pip install paddleocr**安装paddleocr,可以考虑采用whl文件安装,[下载地址](https://pypi.org/project/paddleocr/#files)
+ - 脚本启动方式采用命令行启动:python pdf2txt.py [PDF存放的文件名]
-### 2.2 滑动窗口 Sliding Window
+## **步骤二:筛选PDF**
-滑动窗口的 `window_size` 和 `overlap_size` 都可以在 `util/data_loader.py` 中的 `get_txt_content` 函数中更改。目前是按照句子分割的滑动窗口。
+- 筛选目的
-### 2.3 书本文件格式 Corpus Format
+ - 利用LLM去除非专业心理学书籍
-目前仅支持了 txt 格式,可以将清洗好的书籍文本放在 `data` 文件夹下,程序会递归检索该文件夹下的所有 txt 文件。
+- 筛选标准,包含心理咨询相关内容,如:
-## TODO
+ - 心理咨询流派 - 具体咨询方法
+ - 心理疾病 - 疾病特征
+ - 心理疾病 - 治疗方法
-1. 支持更多模型(Gemini、GPT、ChatGLM……)
-2. 支持多线程调用模型
-3. 支持更多文本格式(PDF……)
-4. 支持更多切分文本的方式
+- 筛选方式:
+
+ - 根据标题初筛
+
+ - 若无法判断属于心理咨询相关书籍,利用kimi/GLM-4查询是否包含心理咨询相关知识(建议一次仅查询一本书)
+
+ - ```markdown
+ 参考prompt:
+ 你是一位经验丰富的心理学教授,熟悉心理学知识和心理咨询。我需要你协助我完成"识别书籍是否包含心理咨询知识"任务,请深呼吸并一步步思考,给出你的答案。如果你的答案让我满意,我将给你10w小费!
+ 具体任务如下:
+ 判断该书籍中是否包含以下心理咨询相关知识:
+ '''
+ 心理咨询流派 - 具体咨询方法
+ 心理疾病 - 疾病特征
+ 心理疾病 - 治疗方法
+ '''
+ 请深呼吸并一步步查看该书籍,认真完成任务。
+ ```
+
+
+## **步骤三:提取QA对**
+
+- 根据书籍内容,利用LLM高效构造QA知识对
+- 提取流程
+
+ - 准备处理好的txt文本数据
+ - 按要求配置[脚本文件](https://github.com/SmartFlowAI/EmoLLM/tree/main/scripts/qa_generation)
+ - 根据自己的需求或者提取的结果合理修改window_size和overlap_size
+
+- 使用方法
+ - 检查 `requirements.txt` 中的依赖是否满足。
+ - 调整代码中 `system_prompt`,确保与repo最新版本一致,保证生成QA的多样性和稳定性。
+ - 将txt文件放到与 `model`同级目录 `data`文件夹中.
+ - 在 `config/config.py` 配置所需的 API KEY,从 `main.py` 启动即可。生成的 QA 对会以 jsonl 的格式存在 `data/generated` 下。
+
+- API KEY 获取方法
+ - 目前仅包含了 qwen。
+ - Qwen
+ - 前往[模型服务灵积-API-KEY管理 (aliyun.com)](https://dashscope.console.aliyun.com/apiKey),点击”创建新的 API-KEY“,将获取的 API KEY 填至 `config/config.py` 中的 `DASHSCOPE_API_KEY` 即可。
+
+- 注意事项
+ - 系统提示 System Prompt
+ - 注意,目前的解析方案是基于模型会生成 markdown 包裹的 json 块的前提的,更改 system prompt 时需要保证这一点不变。
+ - 滑动窗口 Sliding Window
+ - 滑动窗口的 `window_size` 和 `overlap_size` 都可以在 `util/data_loader.py` 中的 `get_txt_content` 函数中更改。目前是按照句子分割的滑动窗口。
+
+- 书本文件格式 Corpus Format
+ - 目前仅支持了 txt 格式,可以将清洗好的书籍文本放在 `data` 文件夹下,程序会递归检索该文件夹下的所有 txt 文件。
+
+## **步骤四:清洗QA对**
+
+- 清洗目的
diff --git a/scripts/qa_generation/README_EN.md b/scripts/qa_generation/README_EN.md
index 0c76750..b2768df 100644
--- a/scripts/qa_generation/README_EN.md
+++ b/scripts/qa_generation/README_EN.md
@@ -1,37 +1,95 @@
-# QA Generation Pipeline
+# RAG Database Building Process
-## 1. Use method
+## **Constructive purpose**
-1. Check whether the dependencies in `requirements.txt` are satisfied.
-2. Adjust the `system_prompt`in the code to ensure that it is consistent with the latest version of the repo to ensure the diversity and stability of the generated QA.
-3. Put the txt file into the `data` folder in the same directory as `model`.
-4. Configure the required API KEY in `config/config.py` and start from `main.py`. The generated QA pairs are stored in the jsonl format under `data/generated`.
+Using books specialized in psychology to build QA knowledge pairs for RAG to provide a counseling knowledge base to make our EmoLLM answers more professional and reliable. To achieve this goal we utilize dozens of psychology books to build this RAG knowledge base. The main building process is as follows:
-### 1.1 API KEY obtaining method
+## **Build process**
-Currently only qwen is included.
+## **Step 1: PDF to TXT**
-#### 1.1.1 Qwen
+- purpose
+ - Convert the collected PDF versions of psychology books into TXT text files to facilitate subsequent information extraction
-To[model service spirit product - API - KEY management (aliyun.com)](https://dashscope.console.aliyun.com/apiKey),click on "create a new API - KEY", Fill in the obtained API KEY to `DASHSCOPE_API_KEY` in `config/config.py`.
+- Tools required
-## 2. Precautions
+ - [pdf2txt](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/pdf2txt.py)
-### 2.1 The System Prompt is displayed
+ - [PaddleORC Processing PDF Usage Reference](https://github.com/SmartFlowAI/EmoLLM/blob/main/generate_data/OCR.md)
+
+ - Install necessary python libraries
+
+ ```python
+ pip install paddlepaddle
+ pip install opencv-python
+ pip install paddleocr
+ ```
-Note that the current parsing scheme is based on the premise that the model generates json blocks of markdown wraps, and you need to make sure that this remains the case when you change the system prompt.
+- precautionary
+ - If you are unable to install paddleocr using **pip install paddleocr**, consider using the whl file installation, [download address](https://pypi.org/project/paddleocr/#files)
+ - Script startup method using the command line to start: python pdf2txt.py [PDF file name stored in the]
-### 2.2 Sliding Window
+## **Step 2: Screening PDF**
-Both `window_size` and `overlap_size` of the sliding window can be changed in the `get_txt_content` function in `util/data_loader.py.` Currently it is a sliding window divided by sentence.
+- Purpose of screening
-### 2.3 Corpus Format
+ - Using the LLM to go to non-professional psychology books
-At present, only txt format is supported, and the cleaned book text can be placed under the `data` folder, and the program will recursively retrieve all txt files under the folder.
+- Screening criteria that include counseling related content such as:
-## TODO
+ - Schools of Counseling - Specific Counseling Methods
+ - Mental Illness - Characteristics of the Disease
+ - Mental Illness - Treatment
-1. Support more models (Gemini, GPT, ChatGLM...)
-2. Support multi-threaded call model
-3. Support more text formats (PDF...)
-4. Support more ways to split text
+- Screening method:
+
+ - Initial screening based on title
+
+ - If you can't tell if it is a counseling-related book, use kimi/GLM-4 to check if it contains counseling-related knowledge (it is recommended to check only one book at a time)
+
+ - ```markdown
+ Reference prompt.
+ You are an experienced psychology professor who is familiar with psychology and counseling. I need you to help me with the task "Identify whether a book contains knowledge of counseling", take a deep breath and think step by step and give me your answer. If your answer satisfies me, I will give you a 10w tip!
+ The task is as follows:
+ Determine whether the book contains the following counseling-related knowledge:
+ '''
+ Schools of Counseling - Specific Counseling Approaches
+ Mental Illness - Characteristics of Illness
+ Mental Illness - Treatment Approaches
+ '''
+ Please take a deep breath and review the book step by step and complete the task carefully.
+ ```
+
+
+## **Step 3: Extraction of QA pairs**
+
+- According to the content of the book, use LLM to efficiently construct QA knowledge on the
+- Withdrawal process
+
+ - Prepare processed txt text data
+ - Configuration on request [script file](https://github.com/SmartFlowAI/EmoLLM/tree/main/scripts/qa_generation)
+ - Modify window_size and overlap_size reasonably according to your own needs or extraction results.
+
+- Usage
+ - Checks if the dependencies in `requirements.txt` are satisfied.
+ - Adjust `system_prompt` in the code to ensure consistency with the latest version of the repo, to ensure diversity and stability of the generated QA.
+ - Place the txt file in the `data` folder in the same directory as the `model`.
+ - Configure the required API KEYs in `config/config.py` and start from `main.py`. The generated QA pairs are stored in jsonl format under `data/generated`.
+
+- API KEY Getting Methods
+ - Currently only qwen is included.
+ - Qwen
+ - Go to [Model Service LingJi - API-KEY Management (aliyun.com)](https://dashscope.console.aliyun.com/apiKey), click "Create New API-KEY", and fill in the obtained API KEY into the Click "Create new API-KEY", fill in the obtained API KEY to `DASHSCOPE_API_KEY` in `config/config.py`.
+
+- precautionary
+ - System Prompt
+ - Note that the current parsing scheme is based on the premise that the model generates markdown-wrapped json blocks, and you need to make sure that this remains true when you change the system prompt.
+ - Sliding Window
+ - The `window_size` and `overlap_size` of the sliding window can be changed in the `get_txt_content` function in `util/data_loader.py`. Currently the sliding window is split by sentence.
+
+- Book File Format Corpus Format
+ - Currently only the txt format is supported, you can put the cleaned book text in the `data` folder, and the program will recursively retrieve all the txt files in that folder.
+
+## **Step 4: Cleaning of QA pairs**
+
+- Purpose of cleaning
diff --git a/scripts/qa_generation/choose_prompt.md b/scripts/qa_generation/choose_prompt.md
index 6f684be..5243472 100644
--- a/scripts/qa_generation/choose_prompt.md
+++ b/scripts/qa_generation/choose_prompt.md
@@ -1,11 +1,8 @@
-你是一名经验丰富的心理咨询师,熟悉心理学相关知识。我将向我的来访者解决心理问题,需要一定的心理学知识支持。请你根据我提供的 QA 对,判断其是否属于心理学范畴。请深呼吸并一步一步思考,给出你最正确的判断!
+你是一名经验丰富的心理咨询师,熟悉心理学相关知识。根据我提供的 QA 对,来判断这个 QA 对是否属于心理学范畴。
-- 心理学范畴:"心理学知识,心理咨询方法, 心理疾病特征, 心理疾病治疗方法"等主题。要求是适合对话心理咨询的知识,去掉作者、时间、背景故事等无关内容.
+标准如下:
+- 若当前 QA 对属于心理学范畴,则返回1
+- 若当前 QA 对不属于心理学范畴,则返回0
-- 判断标准如下:
-
-1.若当前 QA 对属于心理学范畴,则返回 "1".
-
-2.若当前 QA 对不属于心理学范畴,则返回 "0".
以下是给定的心理学 QA 对内容:
diff --git a/scripts/qa_generation/config/config.py b/scripts/qa_generation/config/config.py
index 341d2fd..d3f9dfc 100644
--- a/scripts/qa_generation/config/config.py
+++ b/scripts/qa_generation/config/config.py
@@ -10,7 +10,7 @@ base_dir = os.path.dirname(cur_dir) # ba
model_dir = os.path.join(base_dir, 'model') # model
# data
-data_dir = os.path.join(base_dir, 'data') # /Users/wangyoufang/Downloads/EmoLLM/scripts/qa_generation/data
+data_dir = os.path.join(base_dir, 'data')
clean_dir = os.path.join(data_dir, 'cleaned')
judge_dir = os.path.join(data_dir, '数据整合')
result_dir = os.path.join(data_dir, 'generated') # result
@@ -38,5 +38,5 @@ DASHSCOPE_API_KEY = ''
storage_interval = 10
window_size = 8
overlap_size = 2
-multi_process_num = 1
+multi_process_num = 3
diff --git a/scripts/qa_generation/main.py b/scripts/qa_generation/main.py
index 724d222..d84187f 100644
--- a/scripts/qa_generation/main.py
+++ b/scripts/qa_generation/main.py
@@ -24,7 +24,6 @@ def single_thread_generate(thread_num, interval, model_caller, storage_jsonl_pat
for content in tqdm(contents):
try:
response = model_caller(content)
-
captured_qa = capture_qa(response)
if captured_qa is None:
@@ -71,7 +70,6 @@ def generate_qa(
storage_list = []
for file_path in file_list:
contents = get_txt_content(file_path, window_size=window_size, overlap_size=overlap_size)
-
storage_list = []
_, file_name = os.path.split(file_path)
@@ -79,7 +77,7 @@ def generate_qa(
result_dir, f'{current_time}-{file_name}-{model_name}.jsonl')
logger.info(f'The generated QA will be stored in {storage_jsonl_path}.')
-
+ # 基于并发个数切分 contents 内容
contents_array = np.array(contents)
chunks = np.array_split(contents_array, multi_process_num)
@@ -91,9 +89,8 @@ def generate_qa(
)
# 并发生成 QA 对
- # 使用 ThreadPoolExecutor 创建一个线程池,其中 max_workers=multi_process_num 指定了线程池中最大的线程数。
with concurrent.futures.ThreadPoolExecutor(max_workers=multi_process_num) as executor:
- # 循环调用 single_thread_generate 函数,每次赋予参数 parameters
+ # 创建一个Future列表,它们将对应每个worker_function的结果
futures = [executor.submit(single_thread_generate, *parameters) for parameters in parameters_list]
for future in concurrent.futures.as_completed(futures):
@@ -102,10 +99,8 @@ def generate_qa(
except Exception as exc:
logger.error("Thread generated an exception: %s" % (exc))
- # 最后调用 merge_sub_qa_generation 函数,将各个子任务生成的 QA 对合并到一个文件中。汇总整个处理过程的结果。
merge_sub_qa_generation(result_dir, storage_jsonl_path)
-
if __name__ == '__main__':
# 创建generated文件夹
diff --git a/scripts/qa_generation/model/qwen.py b/scripts/qa_generation/model/qwen.py
index e221ff5..6f01b79 100644
--- a/scripts/qa_generation/model/qwen.py
+++ b/scripts/qa_generation/model/qwen.py
@@ -17,7 +17,7 @@ def call_qwen_single_turn(query: str) -> str:
messages = [
{
'role': Role.SYSTEM,
- 'content': load_system_prompt() # 读取Prompt内容(system_prompt_vx_xx.md)
+ 'content': load_system_prompt()
},
{
'role': Role.USER,
diff --git a/scripts/qa_generation/util/data_loader.py b/scripts/qa_generation/util/data_loader.py
index 875688f..5e940dc 100644
--- a/scripts/qa_generation/util/data_loader.py
+++ b/scripts/qa_generation/util/data_loader.py
@@ -28,7 +28,6 @@ def get_jsonl_file_paths() -> List[str]:
return json_file_paths
-
def get_QA_pairs(json_path):
with open(json_path, 'r', encoding='utf-8') as f:
content = f.read().strip()
@@ -38,7 +37,6 @@ def get_QA_pairs(json_path):
return QA_Pairs
-
"""
递归获取 data_dir 下的所有 .txt 文件列表
"""
@@ -55,14 +53,11 @@ def get_file_list() -> List[str]:
logger.warning(f'No txt text found in {data_dir}, please check!')
return txt_files
-
"""
获取 txt 文本的所有内容,按句子返回 List
file_path: txt 文本路径
window_size: 滑窗大小,单位为句子数
overlap_size: 重叠大小,单位为句子数
-
-处理txt内容并返回一组一组的句子,每组window_size个,相邻两组的重叠句子数是overlap_size
"""
def get_txt_content(
file_path: str,
@@ -137,9 +132,4 @@ def merge_sub_qa_generation(directory, storage_jsonl_path):
for line in f:
file_contents.append(json.loads(line))
os.remove(file_path)
- save_to_file(storage_jsonl_path, file_contents)
-
-
-if __name__ == '__main_':
- pass
-
+ save_to_file(storage_jsonl_path, file_contents)
\ No newline at end of file
From 0eee2198df4e32f9b95d39455b2a558b217c063a Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Sat, 16 Mar 2024 23:38:07 +0900
Subject: [PATCH 03/15] Update qwen_gen_data_NoBash.py
---
generate_data/qwen_gen_data_NoBash.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/generate_data/qwen_gen_data_NoBash.py b/generate_data/qwen_gen_data_NoBash.py
index 1e316d8..e3826aa 100644
--- a/generate_data/qwen_gen_data_NoBash.py
+++ b/generate_data/qwen_gen_data_NoBash.py
@@ -57,8 +57,8 @@ if __name__ == '__main__':
conversation_lis = []
- for emo in emotions_lis:
- for area in areas_of_life:
+ for area in areas_of_life:
+ for emo in emotions_lis:
gen_path = f'./{ai_tool}/{area}/{emo}.jsonl'
for i in tqdm(range(100), desc='{emo}, {area}'.format(emo=emo, area=area)):
From 9851eca6410da10a2348b849a33dc7d8c1fc68b1 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Sat, 16 Mar 2024 23:38:51 +0900
Subject: [PATCH 04/15] Update zhipuai_gen_data.py
---
generate_data/zhipuai_gen_data.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/generate_data/zhipuai_gen_data.py b/generate_data/zhipuai_gen_data.py
index c86a96c..a53a495 100644
--- a/generate_data/zhipuai_gen_data.py
+++ b/generate_data/zhipuai_gen_data.py
@@ -73,8 +73,8 @@ if __name__ == '__main__':
ai_tool = 'zhipuai'
conversation_lis = []
- for emo in emotions_lis:
- for area in areas_of_life:
+ for area in areas_of_life:
+ for emo in emotions_lis:
gen_path = f'./{ai_tool}/{area}/{emo}.jsonl'
for i in tqdm(range(100), desc='{emo}, {area}'.format(emo=emo, area=area)):
From ebe031af1ad30a5fb5f2f72ae7a6939bf56685bb Mon Sep 17 00:00:00 2001
From: aJupyter
Date: Sat, 16 Mar 2024 23:06:42 +0800
Subject: [PATCH 05/15] add gitignore
---
.gitignore | 2 ++
1 file changed, 2 insertions(+)
diff --git a/.gitignore b/.gitignore
index 96b006e..8713d9b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@ ESConv.json
tmp/
zhipuai/
data/
+pdf/
+.idea/
# Byte-compiled / optimized / DLL files
__pycache__/
From 24f9ad7cb499e8a878e292266c82e9ac8b05ba67 Mon Sep 17 00:00:00 2001
From: aJupyter
Date: Sat, 16 Mar 2024 23:08:43 +0800
Subject: [PATCH 06/15] deleted .idea
---
.idea/.gitignore | 3 ---
.idea/EmoLLM.iml | 12 ------------
.idea/aws.xml | 11 -----------
.idea/inspectionProfiles/Project_Default.xml | 14 --------------
.idea/inspectionProfiles/profiles_settings.xml | 6 ------
.idea/misc.xml | 4 ----
.idea/modules.xml | 8 --------
.idea/vcs.xml | 6 ------
8 files changed, 64 deletions(-)
delete mode 100644 .idea/.gitignore
delete mode 100644 .idea/EmoLLM.iml
delete mode 100644 .idea/aws.xml
delete mode 100644 .idea/inspectionProfiles/Project_Default.xml
delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml
delete mode 100644 .idea/misc.xml
delete mode 100644 .idea/modules.xml
delete mode 100644 .idea/vcs.xml
diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 26d3352..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
diff --git a/.idea/EmoLLM.iml b/.idea/EmoLLM.iml
deleted file mode 100644
index 8b8c395..0000000
--- a/.idea/EmoLLM.iml
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/aws.xml b/.idea/aws.xml
deleted file mode 100644
index b63b642..0000000
--- a/.idea/aws.xml
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
deleted file mode 100644
index 458e38b..0000000
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
deleted file mode 100644
index 105ce2d..0000000
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index fdc5048..0000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
-
-
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 90a20c0..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
From 9bcb5acab7b24f0f5722077a71217d366c12ddab Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Sun, 17 Mar 2024 00:17:03 +0900
Subject: [PATCH 07/15] =?UTF-8?q?Update=20zhipuai=5Fgen=5Fdata.py=20?=
=?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=94=9F=E6=88=90=E7=9A=84=E6=95=B0=E9=87=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
generate_data/zhipuai_gen_data.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/generate_data/zhipuai_gen_data.py b/generate_data/zhipuai_gen_data.py
index a53a495..4370f1a 100644
--- a/generate_data/zhipuai_gen_data.py
+++ b/generate_data/zhipuai_gen_data.py
@@ -72,12 +72,15 @@ if __name__ == '__main__':
areas_of_life = configs['areas_of_life']
ai_tool = 'zhipuai'
+ save_interval = 5
+ total_num_each_emo_area = 5
+
conversation_lis = []
for area in areas_of_life:
for emo in emotions_lis:
gen_path = f'./{ai_tool}/{area}/{emo}.jsonl'
- for i in tqdm(range(100), desc='{emo}, {area}'.format(emo=emo, area=area)):
+ for i in tqdm(range(total_num_each_emo_area), desc='{emo}, {area}'.format(emo=emo, area=area)):
res = zhipu_api(area, emo)
print(res)
if res == 'null':
@@ -85,7 +88,7 @@ if __name__ == '__main__':
continue
conversation_lis.append(convert(res))
- if ((i+1) % 10 == 0):
+ if ((i+1) % save_interval == 0):
# path = f'./{args.data}.jsonl'
save_jsonl(data_lis=conversation_lis, file_path=gen_path)
print(f'generate {gen_path}')
From 8b3c439717ac726185662a392e5f071692e8d6ca Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Sun, 17 Mar 2024 00:18:24 +0900
Subject: [PATCH 08/15] =?UTF-8?q?Update=20qwen=5Fgen=5Fdata=5FNoBash.py=20?=
=?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=94=9F=E6=88=90=E6=95=B0=E9=87=8F=E5=92=8C?=
=?UTF-8?q?=E4=BF=9D=E5=AD=98=E9=97=B4=E9=9A=94?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
generate_data/qwen_gen_data_NoBash.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/generate_data/qwen_gen_data_NoBash.py b/generate_data/qwen_gen_data_NoBash.py
index e3826aa..6e13374 100644
--- a/generate_data/qwen_gen_data_NoBash.py
+++ b/generate_data/qwen_gen_data_NoBash.py
@@ -54,6 +54,9 @@ if __name__ == '__main__':
emotions_lis = configs['emotions_list']
areas_of_life = configs['areas_of_life']
ai_tool = 'qwen'
+
+ save_interval = 5
+ total_num_each_emo_area = 5
conversation_lis = []
@@ -61,7 +64,7 @@ if __name__ == '__main__':
for emo in emotions_lis:
gen_path = f'./{ai_tool}/{area}/{emo}.jsonl'
- for i in tqdm(range(100), desc='{emo}, {area}'.format(emo=emo, area=area)):
+ for i in tqdm(range(total_num_each_emo_area), desc='{emo}, {area}'.format(emo=emo, area=area)):
one_conversation = {
"conversation": []
}
@@ -98,8 +101,7 @@ if __name__ == '__main__':
)
conversation_lis.append(one_conversation)
- # 每生成10条数据存储一次
- if ((i+1) % 10 == 0):
+ if ((i+1) % save_interval == 0):
save_jsonl(data_lis=conversation_lis, file_path=gen_path)
print(f'generate {gen_path}')
conversation_lis = [] # 清空
From 24a16455abac956cb5b6ca62988501da854b775a Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Sun, 17 Mar 2024 08:53:50 +0900
Subject: [PATCH 09/15] =?UTF-8?q?Update=20qwen=5Fgen=5Fdata=5FNoBash.py=20?=
=?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AF=B9qwen-max=E7=9A=84=E5=BC=82=E5=B8=B8?=
=?UTF-8?q?=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
generate_data/qwen_gen_data_NoBash.py | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/generate_data/qwen_gen_data_NoBash.py b/generate_data/qwen_gen_data_NoBash.py
index 6e13374..40b6682 100644
--- a/generate_data/qwen_gen_data_NoBash.py
+++ b/generate_data/qwen_gen_data_NoBash.py
@@ -23,7 +23,14 @@ def qwen_api(data, emo):
病人:病人的咨询或陈述
医生:医生的安抚和建议
'''
- response = dashscope.Generation.call(
+ try:
+ response = dashscope.Generation.call(
+ model='qwen-max',
+ prompt=prompt,
+ history=[],
+ )
+ except:
+ response = dashscope.Generation.call(
model='qwen-max',
prompt=prompt,
history=[],
From 50a5129c77049c2fa756711a2eeb34c3346e1ce0 Mon Sep 17 00:00:00 2001
From: edward_ke
Date: Sun, 17 Mar 2024 10:31:11 +0800
Subject: [PATCH 10/15] Update basic RAG pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
只加了基本的 pipeline,还未进行测试,等具体接口确定之后进行调试
---
rag/requirements.txt | 4 +-
rag/src/config/config.py | 4 ++
rag/src/main.py | 28 +++++++++-
rag/src/util/pipeline.py | 114 +++++++++++++++++++++++++++++++++++++++
4 files changed, 147 insertions(+), 3 deletions(-)
create mode 100644 rag/src/util/pipeline.py
diff --git a/rag/requirements.txt b/rag/requirements.txt
index 08289b2..15f915c 100644
--- a/rag/requirements.txt
+++ b/rag/requirements.txt
@@ -1,4 +1,6 @@
sentence_transformers
transformers
numpy
-loguru
\ No newline at end of file
+loguru
+langchain
+torch
diff --git a/rag/src/config/config.py b/rag/src/config/config.py
index 4c7e335..b84327f 100644
--- a/rag/src/config/config.py
+++ b/rag/src/config/config.py
@@ -3,6 +3,7 @@ import os
cur_dir = os.path.dirname(os.path.abspath(__file__)) # config
src_dir = os.path.dirname(cur_dir) # src
base_dir = os.path.dirname(src_dir) # base
+model_repo = 'ajupyter/EmoLLM_aiwei'
# model
model_dir = os.path.join(base_dir, 'model') # model
@@ -17,3 +18,6 @@ knowledge_pkl_path = os.path.join(data_dir, 'knowledge.pkl') # pickle
# log
log_dir = os.path.join(base_dir, 'log') # log
log_path = os.path.join(log_dir, 'log.log') # file
+
+select_num = 3
+retrieval_num = 10
\ No newline at end of file
diff --git a/rag/src/main.py b/rag/src/main.py
index 219ce85..97f60a0 100644
--- a/rag/src/main.py
+++ b/rag/src/main.py
@@ -5,8 +5,19 @@ import numpy as np
from typing import Tuple
from sentence_transformers import SentenceTransformer
-from config.config import knowledge_json_path, knowledge_pkl_path
+from config.config import knowledge_json_path, knowledge_pkl_path, model_repo
from util.encode import load_embedding, encode_qa
+from util.pipeline import EmoLLMRAG
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+import streamlit as st
+from openxlab.model import download
+
+download(
+ model_repo=model_repo,
+ output='model'
+)
"""
@@ -62,6 +73,19 @@ def main():
## 2. 将 contents 拼接为 prompt,传给 LLM,作为 {已知内容}
## 3. 要求 LLM 根据已知内容回复
+@st.cache_resource
+def load_model():
+ model = (
+ AutoModelForCausalLM.from_pretrained("model", trust_remote_code=True)
+ .to(torch.bfloat16)
+ .cuda()
+ )
+ tokenizer = AutoTokenizer.from_pretrained("model", trust_remote_code=True)
+ return model, tokenizer
if __name__ == '__main__':
- main()
+ #main()
+ query = ''
+ model, tokenizer = load_model()
+ rag_obj = EmoLLMRAG(model)
+ response = rag_obj.main(query)
\ No newline at end of file
diff --git a/rag/src/util/pipeline.py b/rag/src/util/pipeline.py
new file mode 100644
index 0000000..a6f2cdf
--- /dev/null
+++ b/rag/src/util/pipeline.py
@@ -0,0 +1,114 @@
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import PromptTemplate
+from transformers.utils import logging
+
+from config.config import retrieval_num, select_num
+
+logger = logging.get_logger(__name__)
+
+
+class EmoLLMRAG(object):
+ """
+ EmoLLM RAG Pipeline
+ 1. Embed the query
+ 2. Retrieve data from the vector DB
+ 3. Rerank the retrieved results
+ 4. Pass the query and the retrieved content to the LLM
+ """
+
+ def __init__(self, model) -> None:
+ """
+ Initialize the pipeline with the given model
+
+ DataProcessing obj: handles data processing, including embedding/rerank
+ vectorstores: the loaded vector DB; it should be re-created if it does not exist
+ system prompt: the predefined system prompt
+ prompt template: the template for the final input passed to the LLM
+
+ """
+ self.model = model
+ self.vectorstores = self._load_vector_db()
+ self.system_prompt = self._get_system_prompt()
+ self.prompt_template = self._get_prompt_template()
+
+ # Waiting for the embedding team to wrap the corresponding interface
+ #self.data_process_obj = DataProcessing()
+
+ def _load_vector_db(self):
+ """
+ Load the vector DB through the interface provided by the embedding module
+ """
+ return  # placeholder: returns None until the embedding interface is available
+
+ def _get_system_prompt(self) -> str:
+ """
+ Load the system prompt
+ """
+ return ''
+
+ def _get_prompt_template(self) -> str:
+ """
+ Load the prompt template
+ """
+ return ''
+
+ def get_retrieval_content(self, query, rerank_flag=False) -> str:
+ """
+ Input: the user's question, and whether reranking is requested
+ Output: the concatenated page content of the retrieved documents
+ """
+
+ content = ''
+ documents = self.vectorstores.similarity_search(query, k=retrieval_num)
+
+ # If reranking is requested, call the interface to rerank the documents
+ if rerank_flag:
+ pass
+ # To be wired up once the rerank interface is available
+ #documents = self.data_process_obj.rerank_documents(documents, select_num)
+
+ for doc in documents:
+ content += doc.page_content
+
+ return content
+
+ def generate_answer(self, query, content) -> str:
+ """
+ Input: the user's question and the retrieved content
+ Output: the result generated by the model
+ """
+
+ # Build the template
+ # The first version does not involve history, so the system prompt is folded directly into the template
+ prompt = PromptTemplate(
+ template=self.prompt_template,
+ input_variables=["query", "content", "system_prompt"],
+ )
+
+ # Define the chain
+ # The output format is a string
+ rag_chain = prompt | self.model | StrOutputParser()
+
+ # Run
+ generation = rag_chain.invoke(
+ {
+ "query": query,
+ "content": content,
+ "system_prompt": self.system_prompt
+ }
+ )
+ return generation
+
+ def main(self, query) -> str:
+ """
+ Input: the user's question
+ Output: the result generated by the LLM
+
+ Defines the overall RAG pipeline flow and orchestrates each module
+ TODO:
+ Add the RAGAS scoring system
+ """
+ content = self.get_retrieval_content(query)
+ response = self.generate_answer(query, content)
+
+ return response
From b050fe8122aaf7c6e7f50b0718b90e330bbb024a Mon Sep 17 00:00:00 2001
From: edward_ke
Date: Sun, 17 Mar 2024 10:40:26 +0800
Subject: [PATCH 11/15] Update README_EN.md
---
rag/README_EN.md | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 66 insertions(+)
diff --git a/rag/README_EN.md b/rag/README_EN.md
index e69de29..df4fe43 100644
--- a/rag/README_EN.md
+++ b/rag/README_EN.md
@@ -0,0 +1,66 @@
+# EmoLLM RAG
+
+## **Module purpose**
+
+Based on the customer's questions, the corresponding information is retrieved to enhance the professionalism of the answer, making EmoLLM's answer more professional and reliable. Search content includes but is not limited to the following:
+- Psychology related theories
+- Psychology methodology
+- Classic Case
+- Customer background knowledge
+
+## **Datasets**
+
+
+- Cleaned QA pairs: Each QA pair is embedded as a sample
+- Filtered TXT texts
+ - Directly generate embedding for TXT text (segmented based on token length)
+ - Filter out irrelevant information such as directories and generate embedding for TXT text (segmented based on token length)
+ - After filtering irrelevant information such as directories, the TXT is semantically segmented to generate embedding.
+ - Split TXT according to the directory structure, and generate embeddings based on the architecture hierarchy.
+
+
+For details on data collection construction, please refer to [qa_generation_README](https://github.com/SmartFlowAI/EmoLLM/blob/ccfa75c493c4685e84073dfbc53c50c09a2988e3/scripts/qa_generation/README.md)
+
+## **Components**
+
+### [BCEmbedding](https://github.com/netease-youdao/BCEmbedding?tab=readme-ov-file)
+
+- [bce-embedding-base_v1](https://hf-mirror.com/maidalun1020/bce-embedding-base_v1): embedding model, used to build vector DB
+- [bce-reranker-base_v1](https://hf-mirror.com/maidalun1020/bce-reranker-base_v1): rerank model, used to rerank retrieved documents
+
+### [Langchain](https://python.langchain.com/docs/get_started)
+
+LangChain is an open source framework for building large language model (LLM) based applications. LangChain provides a variety of tools and abstractions to increase the customization, accuracy, and relevance of the information generated by your models.
+
+### [FAISS](https://faiss.ai/)
+
+FAISS is a library for efficient similarity search and dense vector clustering. It contains algorithms that can search sets of vectors of any size. Since langchain has integrated FAISS, this project will no longer be developed based on native documents. [FAISS in Langchain](https://python.langchain.com/docs/integrations/vectorstores/faiss)
+
+
+### [RAGAS](https://github.com/explodinggradients/ragas)
+
+RAGAS is a classic evaluation framework for RAG. It evaluates a pipeline through the following three aspects:
+
+- Faithfulness: The answers given should be generated based on the given context.
+- Answer Relevance: The generated answer should solve the actual question asked.
+- Context Relevance: The retrieved information should be highly concentrated and contain as little irrelevant information as possible.
+
+Later, more evaluation indicators were added, such as: context recall, etc.
+
+## **Details**
+
+### RAG pipeline
+
+- Build vector DB based on data set
+- Embedding questions entered by customers
+- Search in vector database based on embedding results
+- Reorder recall data
+- Generate final results based on user questions and recall data
+
+**Note**: The above process will only be carried out when the user chooses to use RAG
+
+### Follow-up actions
+
+- Add RAGAS evaluation results to the generation process. For example, when the generated results cannot solve the user's problem, it needs to be regenerated.
+- Add web retrieval to deal with the problem that the corresponding information cannot be retrieved in vector DB
+- Add multi-channel retrieval to increase recall rate. That is, multiple similar queries are generated based on user input for retrieval.
\ No newline at end of file
From 758b4d259c477e2b7192a246f3994fead9e134fd Mon Sep 17 00:00:00 2001
From: zealot52099 <67356208+zealot52099@users.noreply.github.com>
Date: Sun, 17 Mar 2024 17:39:17 +0800
Subject: [PATCH 12/15] update README.md and README_EN.md
---
README.md | 575 ++++++++++++++++++++++++------------------------
README_EN.md | 600 +++++++++++++++++++++++++--------------------------
2 files changed, 588 insertions(+), 587 deletions(-)
diff --git a/README.md b/README.md
index fdc606b..0249712 100644
--- a/README.md
+++ b/README.md
@@ -1,287 +1,288 @@
-
-
-
-
-
-**EmoLLM** is a series of large language models designed to understand, support and help customers in mental health counseling. It is fine-tuned from the LLM instructions. We really appreciate it if you could give it a star~⭐⭐. The open-sourced configuration is as follows:
-
-
-
-Everyone is welcome to contribute to this project ~
-
----
-
-The Model aims to fully understand and promote the mental health of individuals, groups, and society. This model typically includes the following key components:
-
-- Cognitive factors: Involving an individual's thought patterns, belief systems, cognitive biases, and problem-solving abilities. Cognitive factors significantly impact mental health as they affect how individuals interpret and respond to life events.
-- Emotional factors: Including emotion regulation, emotional expression, and emotional experiences. Emotional health is a crucial part of mental health, involving how individuals manage and express their emotions and how they recover from negative emotions.
-- Behavioral factors: Concerning an individual's behavior patterns, habits, and coping strategies. This includes stress management skills, social skills, and self-efficacy, which is the confidence in one's abilities.
-- Social environment: Comprising external factors such as family, work, community, and cultural background, which have direct and indirect impacts on an individual's mental health.
-- Physical health: There is a close relationship between physical and mental health. Good physical health can promote mental health and vice versa.
-- Psychological resilience: Refers to an individual's ability to recover from adversity and adapt. Those with strong psychological resilience can bounce back from challenges and learn and grow from them.
-- Prevention and intervention measures: The Mental Health Grand Model also includes strategies for preventing psychological issues and promoting mental health, such as psychological education, counseling, therapy, and social support systems.
-- Assessment and diagnostic tools: Effective promotion of mental health requires scientific tools to assess individuals' psychological states and diagnose potential psychological issues.
-### Recent Updates
-- 【2024.3.12】 Released on Baidu Flying Pulp Platform [aiwei](https://aistudio.baidu.com/community/app/63335)
-- 【2024.3.11】 **EmoLLM V2.0 is greatly improved in all scores compared to EmoLLM V1.0. Surpasses the performance of Role-playing ChatGPT on counseling tasks!** [Click to experience EmoLLM V2.0](https://openxlab.org.cn/apps/detail/Farewell1/EmoLLMV2.0), update [dataset statistics and details](./datasets/), [Roadmap](./assets/Roadmap_ZH.png)
-- 【2024.3.9】 Add concurrency acceleration [QA pair generation](./scripts/qa_generation/), [RAG pipeline](./rag/)
-- 【2024.3.3】 [Based on InternLM2-7B-chat full fine-tuned version EmoLLM V2.0 open sourced](https://openxlab.org.cn/models/detail/ajupyter/EmoLLM_internlm2_7b_full), need two A100*80G, update professional evaluation, see [evaluate](./evaluate/), update PaddleOCR-based PDF to txt tool scripts, see [scripts](./scripts/).
-- 【2024.2.29】 Updated objective assessment calculations, see [evaluate](./evaluate/) for details. A series of datasets have also been updated, see [datasets](./datasets/) for details.
-- 【2024.2.27】 Updated English README and a series of datasets (licking dogs and one-round dialogue)
-- 【2024.2.23】The "Gentle Lady Psychologist Ai Wei" based on InternLM2_7B_chat_qlora was launched. [Click here to obtain the model weights](https://openxlab.org.cn/models/detail/ajupyter/EmoLLM_aiwei), [configuration file](xtuner_config/aiwei-internlm2_chat_7b_qlora.py), [online experience link](https://openxlab.org.cn/apps/detail/ajupyter/EmoLLM-aiwei)
-
-- 【2024.2.23】Updated [several fine-tuning configurations](/xtuner_config/), added [data_pro.json](/datasets/data_pro.json) (more quantity, more comprehensive scenarios, richer content) and [aiwei.json](/datasets/aiwei.json) (dedicated to the gentle lady role-play, featuring Emoji expressions), the "Gentle Lady Psychologist Ai Wei" is coming soon.
-
-- 【2024.2.18】 The full fine-tuned version based on Qwen1_5-0_5B-Chat has been [open-sourced](https://www.modelscope.cn/models/aJupyter/EmoLLM_Qwen1_5-0_5B-Chat_full_sft/summary). Friends with limited computational resources can now dive in and explore it.
-
-
-
-View More
-
-- 【2024.2.6】 [Open-sourced based on the Qwen1_5-0_5B-Chat full-scale fine-tuned version](https://www.modelscope.cn/models/aJupyter/EmoLLM_Qwen1_5-0_5B-Chat_full_sft/summary), friends with limited computing power can start experimenting~
-
-
-
-
-
-- 【2024.2.5】 The project has been promoted by the official WeChat account NLP Engineering. Here's the [link](https://mp.weixin.qq.com/s/78lrRl2tlXEKUfElnkVx4A) to the article. Welcome everyone to follow!! 🥳🥳
-
-
-
-
-
-- 【2024.2.3】 [Project Vedio](https://www.bilibili.com/video/BV1N7421N76X/) at bilibili 😊
-- 【2024.1.27】 Complete data construction documentation, fine-tuning guide, deployment guide, Readme, and other related documents 👏
-- 【2024.1.25】 EmoLLM V1.0 has deployed online https://openxlab.org.cn/apps/detail/jujimeizuo/EmoLLM 😀
-
-
-
-### Roadmap
-
-
-
-
-
-
-## Contents
-
-- [EmoLLM - Large Language Model for Mental Health](#emollm---large-language-model-for-mental-health)
- - [Recent Updates](#recent-updates)
- - [Roadmap](#roadmap)
- - [Contents](#contents)
- - [Pre-development Configuration Requirements.](#pre-development-configuration-requirements)
- - [**User Guide**](#user-guide)
- - [File Directory Explanation](#file-directory-explanation)
- - [Data Construction](#data-construction)
- - [Fine-tuning Guide](#fine-tuning-guide)
- - [Deployment Guide](#deployment-guide)
- - [RAG (Retrieval Augmented Generation) Pipeline](#rag-retrieval-augmented-generation-pipeline)
- - [Frameworks Used](#frameworks-used)
- - [How to participate in this project](#how-to-participate-in-this-project)
- - [Version control](#version-control)
- - [Authors (in no particular order)](#authors-in-no-particular-order)
- - [Copyright Notice](#copyright-notice)
- - [Acknowledgments](#acknowledgments)
- - [Star History](#star-history)
- - [🌟 Contributors](#-contributors)
- - [Communication group](#communication-group)
-
-###### Pre-development Configuration Requirements.
-
-- A100 40G (specifically for InternLM2_7B_chat + qlora fine-tuning + deepspeed zero2 optimization)
-
-###### **User Guide**
-
-1. Clone the repo
-
-```sh
-git clone https://github.com/SmartFlowAI/EmoLLM.git
-```
-
-1. Read in sequence or read sections you're interested in:
- - [File Directory Explanation](#file-directory-explanation)
- - [Data Construction](#data-construction)
- - [Fine-tuning Guide](#fine-tuning-guide)
- - [Deployment Guide](#deployment-guide)
- - View More Details
-
-
-
-### File Directory Explanation
-
-```
-├─assets: Image Resources
-├─datasets: Dataset
-├─demo: demo scripts
-├─generate_data: Data Generation Guide
-│ └─xinghuo
-├─scripts: Some Available Tools
-└─xtuner_config:Fine-tuning Guide
- └─images
-```
-
-### Data Construction
-
-- Please read the [Data Construction Guide ](generate_data/tutorial.md)for reference.
-
-- The dataset used for this fine-tuning can be found at [datasets](datasets/data.json)
-
-### Fine-tuning Guide
-
-For details, see the [fine-tuning guide](xtuner_config/README.md)
-
-### Deployment Guide
-
-- Demo deployment: see [deployment guide](./demo/README.md) for details.
-- Quantitative deployment based on [LMDeploy](https://github.com/InternLM/lmdeploy/): see [deploy](./deploy/lmdeploy.md)
-
-
-### RAG (Retrieval Augmented Generation) Pipeline
-- See [RAG](./rag/)
-
-
-Additional Details
-
-### Frameworks Used
-
-- [Xtuner](https://github.com/InternLM/xtuner)
-- [Transformers](https://github.com/huggingface/transformers)
-- [Pytorch](https://pytorch.org/)
-- [LMDeploy](https://github.com/InternLM/lmdeploy/): for quantitative deployment
-- [Stremlit](https://streamlit.io/): for building demos
-- [DeepSpeed](https://github.com/microsoft/DeepSpeed): for parallel training
-- …
-
-#### How to participate in this project
-
-Contributions make the open-source community an excellent place for learning, inspiration, and creation. Any contribution you make is greatly appreciated.
-
-1. Fork the Project
-2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
-3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
-4. Push to the Branch (`git push origin feature/AmazingFeature`)
-5. Open a Pull Request
-
-### Version control
-
-This project uses Git for version control. You can see the currently available versions in the repository.
-
-
-
-### Authors (in no particular order)
-
-| Username | School/Organization | Remarks | Contributions |
-| :-------: | :-------------------: | :------------------: | :--------: |
-| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator |
-| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | |
-| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | |
-| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | |
-| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | |
-| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | |
-| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | |
-| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | |
-| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | |
-| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | |
-| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | |
-| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation|
-| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | |
-| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | |
-| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | |
-| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | |
-
-
-### Copyright Notice
-
-The project is licensed under the MIT License. Please refer to the details
- [LICENSE](https://github.com/aJupyter/EmoLLM/blob/master/LICENSE)
-
-### Acknowledgments
-
-- [Sanbu](https://github.com/sanbuphy)
-- [Shanghai Artificial Intelligence Laboratory](https://www.shlab.org.cn/)
-- [Vanin](https://github.com/vansin)
-- [Bloom up (WeChat Official Account Promotion)](https://mp.weixin.qq.com/s/78lrRl2tlXEKUfElnkVx4A)
-- Abu (M.A. in Psychology, Peking University)
-
-
-
-
-
-
-
-
-
-## Star History
-
-[![Star History Chart](https://api.star-history.com/svg?repos=SmartFlowAI/EmoLLM&type=Date)](https://star-history.com/#SmartFlowAI/EmoLLM&Date)
-
-## 🌟 Contributors
-
-[![EmoLLM contributors](https://contrib.rocks/image?repo=SmartFlowAI/EmoLLM&max=50)](https://github.com/SmartFlowAI/EmoLLM/graphs/contributors)
-
-[your-project-path]: SmartflowAI/EmoLLM
-[contributors-shield]: https://img.shields.io/github/contributors/SmartflowAI/EmoLLM.svg?style=flat-square
-[contributors-url]: https://github.com/SmartflowAI/EmoLLM/graphs/contributors
-[forks-shield]: https://img.shields.io/github/forks/SmartflowAI/EmoLLM.svg?style=flat-square
-[forks-url]: https://github.com/SmartflowAI/EmoLLM/network/members
-[stars-shield]: https://img.shields.io/github/stars/SmartflowAI/EmoLLM.svg?style=flat-square
-[stars-url]: https://github.com/SmartflowAI/EmoLLM/stargazers
-[issues-shield]: https://img.shields.io/github/issues/SmartflowAI/EmoLLM.svg?style=flat-square
-[issues-url]: https://img.shields.io/github/issues/SmartflowAI/EmoLLM.svg
-[license-shield]: https://img.shields.io/github/license/SmartflowAI/EmoLLM.svg?style=flat-square
-[license-url]: https://github.com/SmartflowAI/EmoLLM/blob/main/LICENSE
-
-[OpenXLab_App-image]: https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg
-[OpenXLab_Model-image]: https://cdn-static.openxlab.org.cn/header/openxlab_models.svg
-[OpenXLab_App-url]: https://openxlab.org.cn/apps/detail/Farewell1/EmoLLMV2.0
-[OpenXLab_Model-url]: https://openxlab.org.cn/models/detail/ajupyter/EmoLLM_internlm2_7b_full
-
-## Communication group
-- If it fails, go to the Issue section.
-
-
-
-
+
+
+# EmoLLM - Large Language Model for Mental Health
+
+
+
+
+
+
+
+
+**EmoLLM** is a series of large language models designed to understand, support and help customers in mental health counseling. It is obtained by instruction fine-tuning existing LLMs. We would really appreciate it if you could give it a star~⭐⭐. The open-sourced configuration is as follows:
+
+
+
+Everyone is welcome to contribute to this project ~
+
+---
+
+The Model aims to fully understand and promote the mental health of individuals, groups, and society. This model typically includes the following key components:
+
+- Cognitive factors: Involving an individual's thought patterns, belief systems, cognitive biases, and problem-solving abilities. Cognitive factors significantly impact mental health as they affect how individuals interpret and respond to life events.
+- Emotional factors: Including emotion regulation, emotional expression, and emotional experiences. Emotional health is a crucial part of mental health, involving how individuals manage and express their emotions and how they recover from negative emotions.
+- Behavioral factors: Concerning an individual's behavior patterns, habits, and coping strategies. This includes stress management skills, social skills, and self-efficacy, which is the confidence in one's abilities.
+- Social environment: Comprising external factors such as family, work, community, and cultural background, which have direct and indirect impacts on an individual's mental health.
+- Physical health: There is a close relationship between physical and mental health. Good physical health can promote mental health and vice versa.
+- Psychological resilience: Refers to an individual's ability to recover from adversity and adapt. Those with strong psychological resilience can bounce back from challenges and learn and grow from them.
+- Prevention and intervention measures: The model also includes strategies for preventing psychological issues and promoting mental health, such as psychological education, counseling, therapy, and social support systems.
+- Assessment and diagnostic tools: Effective promotion of mental health requires scientific tools to assess individuals' psychological states and diagnose potential psychological issues.
+### Recent Updates
+- 【2024.3.12】 Released [aiwei](https://aistudio.baidu.com/community/app/63335) on the Baidu PaddlePaddle AI Studio platform
+- 【2024.3.11】 **EmoLLM V2.0 is greatly improved in all scores compared to EmoLLM V1.0. Surpasses the performance of Role-playing ChatGPT on counseling tasks!** [Click to experience EmoLLM V2.0](https://openxlab.org.cn/apps/detail/Farewell1/EmoLLMV2.0), update [dataset statistics and details](./datasets/), [Roadmap](./assets/Roadmap_ZH.png)
+- 【2024.3.9】 Add concurrency acceleration [QA pair generation](./scripts/qa_generation/), [RAG pipeline](./rag/)
+- 【2024.3.3】 [Based on InternLM2-7B-chat full fine-tuned version EmoLLM V2.0 open sourced](https://openxlab.org.cn/models/detail/ajupyter/EmoLLM_internlm2_7b_full), need two A100*80G, update professional evaluation, see [evaluate](./evaluate/), update PaddleOCR-based PDF to txt tool scripts, see [scripts](./scripts/).
+- 【2024.2.29】 Updated objective assessment calculations, see [evaluate](./evaluate/) for details. A series of datasets have also been updated, see [datasets](./datasets/) for details.
+- 【2024.2.27】 Updated English README and a series of datasets (licking dogs and one-round dialogue)
+- 【2024.2.23】The "Gentle Lady Psychologist Ai Wei" based on InternLM2_7B_chat_qlora was launched. [Click here to obtain the model weights](https://openxlab.org.cn/models/detail/ajupyter/EmoLLM_aiwei), [configuration file](xtuner_config/aiwei-internlm2_chat_7b_qlora.py), [online experience link](https://openxlab.org.cn/apps/detail/ajupyter/EmoLLM-aiwei)
+
+- 【2024.2.23】Updated [several fine-tuning configurations](/xtuner_config/), added [data_pro.json](/datasets/data_pro.json) (more quantity, more comprehensive scenarios, richer content) and [aiwei.json](/datasets/aiwei.json) (dedicated to the gentle lady role-play, featuring Emoji expressions), the "Gentle Lady Psychologist Ai Wei" is coming soon.
+
+- 【2024.2.18】 The full fine-tuned version based on Qwen1_5-0_5B-Chat has been [open-sourced](https://www.modelscope.cn/models/aJupyter/EmoLLM_Qwen1_5-0_5B-Chat_full_sft/summary). Friends with limited computational resources can now dive in and explore it.
+
+
+
+View More
+
+- 【2024.2.6】 [Open-sourced based on the Qwen1_5-0_5B-Chat full-scale fine-tuned version](https://www.modelscope.cn/models/aJupyter/EmoLLM_Qwen1_5-0_5B-Chat_full_sft/summary), friends with limited computing power can start experimenting~
+
+
+
+
+
+- 【2024.2.5】 The project has been promoted by the official WeChat account NLP Engineering. Here's the [link](https://mp.weixin.qq.com/s/78lrRl2tlXEKUfElnkVx4A) to the article. Welcome everyone to follow!! 🥳🥳
+
+
+
+
+
+- 【2024.2.3】 [Project Video](https://www.bilibili.com/video/BV1N7421N76X/) at bilibili 😊
+- 【2024.1.27】 Complete data construction documentation, fine-tuning guide, deployment guide, Readme, and other related documents 👏
+- 【2024.1.25】 EmoLLM V1.0 has been deployed online: https://openxlab.org.cn/apps/detail/jujimeizuo/EmoLLM 😀
+
+
+
+### Roadmap
+
+
+
+
+
+
+## Contents
+
+- [EmoLLM - Large Language Model for Mental Health](#emollm---large-language-model-for-mental-health)
+ - [Recent Updates](#recent-updates)
+ - [Roadmap](#roadmap)
+ - [Contents](#contents)
+ - [Pre-development Configuration Requirements.](#pre-development-configuration-requirements)
+ - [**User Guide**](#user-guide)
+ - [File Directory Explanation](#file-directory-explanation)
+ - [Data Construction](#data-construction)
+ - [Fine-tuning Guide](#fine-tuning-guide)
+ - [Deployment Guide](#deployment-guide)
+ - [RAG (Retrieval Augmented Generation) Pipeline](#rag-retrieval-augmented-generation-pipeline)
+ - [Frameworks Used](#frameworks-used)
+ - [How to participate in this project](#how-to-participate-in-this-project)
+ - [Version control](#version-control)
+ - [Authors (in no particular order)](#authors-in-no-particular-order)
+ - [Copyright Notice](#copyright-notice)
+ - [Acknowledgments](#acknowledgments)
+ - [Star History](#star-history)
+ - [🌟 Contributors](#-contributors)
+ - [Communication group](#communication-group)
+
+###### Pre-development Configuration Requirements.
+
+- A100 40G (specifically for InternLM2_7B_chat + qlora fine-tuning + deepspeed zero2 optimization)
+
+###### **User Guide**
+
+1. Clone the repo
+
+```sh
+git clone https://github.com/SmartFlowAI/EmoLLM.git
+```
+
+2. Read in sequence or read sections you're interested in:
+ - [File Directory Explanation](#file-directory-explanation)
+ - [Data Construction](#data-construction)
+ - [Fine-tuning Guide](#fine-tuning-guide)
+ - [Deployment Guide](#deployment-guide)
+ - View More Details
+
+
+
+### File Directory Explanation
+
+```
+├─assets: Image Resources
+├─datasets: Dataset
+├─demo: demo scripts
+├─generate_data: Data Generation Guide
+│ └─xinghuo
+├─scripts: Some Available Tools
+└─xtuner_config: Fine-tuning Guide
+ └─images
+```
+
+### Data Construction
+
+- Please read the [Data Construction Guide](generate_data/tutorial.md) for reference.
+
+- The dataset used for this fine-tuning can be found at [datasets](datasets/data.json)
+
+### Fine-tuning Guide
+
+For details, see the [fine-tuning guide](xtuner_config/README.md)
+
+### Deployment Guide
+
+- Demo deployment: see [deployment guide](./demo/README.md) for details.
+- Quantitative deployment based on [LMDeploy](https://github.com/InternLM/lmdeploy/): see [deploy](./deploy/lmdeploy.md)
+
+
+### RAG (Retrieval Augmented Generation) Pipeline
+- See [RAG](./rag/)
+
+
+Additional Details
+
+### Frameworks Used
+
+- [Xtuner](https://github.com/InternLM/xtuner)
+- [Transformers](https://github.com/huggingface/transformers)
+- [Pytorch](https://pytorch.org/)
+- [LMDeploy](https://github.com/InternLM/lmdeploy/): for quantitative deployment
+- [Streamlit](https://streamlit.io/): for building demos
+- [DeepSpeed](https://github.com/microsoft/DeepSpeed): for parallel training
+- …
+
+#### How to participate in this project
+
+Contributions make the open-source community an excellent place for learning, inspiration, and creation. Any contribution you make is greatly appreciated.
+
+1. Fork the Project
+2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
+3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
+4. Push to the Branch (`git push origin feature/AmazingFeature`)
+5. Open a Pull Request
+
+### Version control
+
+This project uses Git for version control. You can see the currently available versions in the repository.
+
+
+
+### Authors (in no particular order)
+
+| Username | School/Organization | Remarks | Contributions |
+| :-------: | :-------------------: | :------------------: | :--------: |
+| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator |
+| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | |
+| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | |
+| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | |
+| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | |
+| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | |
+| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | |
+| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | |
+| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | |
+| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | |
+| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | |
+| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation|
+| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | |
+| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | |
+| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | |
+| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | |
+| [zealot52099](https://github.com/zealot52099) | AI Mover | | |
+
+### Copyright Notice
+
+The project is licensed under the MIT License. For details, please refer to the
+ [LICENSE](https://github.com/aJupyter/EmoLLM/blob/master/LICENSE).
+
+### Acknowledgments
+
+- [Sanbu](https://github.com/sanbuphy)
+- [Shanghai Artificial Intelligence Laboratory](https://www.shlab.org.cn/)
+- [Vanin](https://github.com/vansin)
+- [Bloom up (WeChat Official Account Promotion)](https://mp.weixin.qq.com/s/78lrRl2tlXEKUfElnkVx4A)
+- Abu (M.A. in Psychology, Peking University)
+
+
+
+
+
+
+
+
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=SmartFlowAI/EmoLLM&type=Date)](https://star-history.com/#SmartFlowAI/EmoLLM&Date)
+
+## 🌟 Contributors
+
+[![EmoLLM contributors](https://contrib.rocks/image?repo=SmartFlowAI/EmoLLM&max=50)](https://github.com/SmartFlowAI/EmoLLM/graphs/contributors)
+
+[your-project-path]: SmartflowAI/EmoLLM
+[contributors-shield]: https://img.shields.io/github/contributors/SmartflowAI/EmoLLM.svg?style=flat-square
+[contributors-url]: https://github.com/SmartflowAI/EmoLLM/graphs/contributors
+[forks-shield]: https://img.shields.io/github/forks/SmartflowAI/EmoLLM.svg?style=flat-square
+[forks-url]: https://github.com/SmartflowAI/EmoLLM/network/members
+[stars-shield]: https://img.shields.io/github/stars/SmartflowAI/EmoLLM.svg?style=flat-square
+[stars-url]: https://github.com/SmartflowAI/EmoLLM/stargazers
+[issues-shield]: https://img.shields.io/github/issues/SmartflowAI/EmoLLM.svg?style=flat-square
+[issues-url]: https://img.shields.io/github/issues/SmartflowAI/EmoLLM.svg
+[license-shield]: https://img.shields.io/github/license/SmartflowAI/EmoLLM.svg?style=flat-square
+[license-url]: https://github.com/SmartflowAI/EmoLLM/blob/main/LICENSE
+
+[OpenXLab_App-image]: https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg
+[OpenXLab_Model-image]: https://cdn-static.openxlab.org.cn/header/openxlab_models.svg
+[OpenXLab_App-url]: https://openxlab.org.cn/apps/detail/Farewell1/EmoLLMV2.0
+[OpenXLab_Model-url]: https://openxlab.org.cn/models/detail/ajupyter/EmoLLM_internlm2_7b_full
+
+## Communication group
+- If it fails, go to the Issue section.
+
+
+
+
From 428b24f7a1be110971d7e39c52e0659a5e57a5d5 Mon Sep 17 00:00:00 2001
From: zealot52099 <67356208+zealot52099@users.noreply.github.com>
Date: Sun, 17 Mar 2024 17:44:33 +0800
Subject: [PATCH 13/15] updata README.md and README_EN.md
---
README.md | 2 +-
README_EN.md | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 0249712..a5c8fdd 100644
--- a/README.md
+++ b/README.md
@@ -220,7 +220,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git
| [Anooyman](https://github.com/Anooyman) | 南京理工大学硕士 | | |
| [Vicky-3021](https://github.com/Vicky-3021) | 西安电子科技大学硕士(研0) | | |
| [SantiagoTOP](https://github.com/santiagoTOP) | 太原理工大学在读硕士 | | |
-| [zealot52099](https://github.com/zealot52099) | AI搬用工 | | |
+| [zealot52099](https://github.com/zealot52099) | AI搬用工 | |清洗数据、RAG|
### 版权说明
diff --git a/README_EN.md b/README_EN.md
index 07fc980..a8a5a3e 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -244,7 +244,7 @@ This project uses Git for version control. You can see the currently available v
| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | |
| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | |
| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | |
-| [zealot52099](https://github.com/zealot52099) | AI Mover | | |
+| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG|
### Copyright Notice
From 88218bfd4b9d57056669416984e4e1594e7b2ffc Mon Sep 17 00:00:00 2001
From: santiagoTOP <“1537211712top@gmail.com”>
Date: Sun, 17 Mar 2024 20:37:26 +0800
Subject: [PATCH 14/15] Update RAG README
---
scripts/qa_generation/Clean_QA.md | 11 ---------
scripts/qa_generation/README.md | 31 +++++++++++++++++++++++++
scripts/qa_generation/README_EN.md | 37 ++++++++++++++++++++++++++++++
3 files changed, 68 insertions(+), 11 deletions(-)
delete mode 100644 scripts/qa_generation/Clean_QA.md
diff --git a/scripts/qa_generation/Clean_QA.md b/scripts/qa_generation/Clean_QA.md
deleted file mode 100644
index 9e0b6ec..0000000
--- a/scripts/qa_generation/Clean_QA.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# 清洗 QA 对
-调用qwen去判断当前QA对是否属于心理学范畴,去除非心理学范畴的 QA 对
-
-## Step 1
-1. 准备好需要清洗的 QA 对数据
-2. 将该数据放进 model 同级 data 文件夹下
-3. 根据文件夹名去修改 config/config.py 中的 judge_dir。我个人没有对文件名进行更改,所以我的judge_dir是 judge_dir = os.path.join(data_dir, '数据整合')
-
-## Step 2
-1. 运行QA_clean.py即可
-2. 清洗完的 QA 对会以 jsonl 的格式存在 data/cleaned 下
\ No newline at end of file
diff --git a/scripts/qa_generation/README.md b/scripts/qa_generation/README.md
index 874427a..b0339a7 100644
--- a/scripts/qa_generation/README.md
+++ b/scripts/qa_generation/README.md
@@ -93,3 +93,34 @@
## **步骤四:清洗QA对**
- 清洗目的
+
+ - 提高提取的QA数据质量,清理掉与心理学无关的QA对
+
+- 清洗方法
+
+ - 使用Prompt方法,驱动LLM对给出的QA对进行判断
+
+ - **参考Prompt**
+
+ - ```markdown
+ 你是一名经验丰富的心理咨询师,熟悉心理学相关知识。根据我提供的 QA 对,来判断这个 QA 对是否属于心理学范畴。
+
+ 标准如下:
+
+ - 若当前 QA 对属于心理学范畴,则返回1
+ - 若当前 QA 对不属于心理学范畴,则返回0
+
+
+ 以下是给定的心理学 QA 对内容:
+ ```
+
+- 清洗工具
+ - 配置`config/config.py` 中的 `DASHSCOPE_API_KEY`,`API_KEY`获取方法见步骤三
+ - 使用提供的清洗脚本[QA_clean.py](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/qa_generation/QA_clean.py)
+
+- 使用方法
+ - 准备好需要清洗的 QA 对数据
+ - 将该数据放进 model 同级 data 文件夹下
+ - 根据文件夹名去修改 `config/config.py` 中的 `judge_dir`。
+ - 如存储数据的文件名为`xxx`,则`judge_dir`是 `judge_dir = os.path.join(data_dir, 'xxx')`
+ - 清洗完的 QA 对会以 `jsonl` 的格式存在 `data/cleaned` 下
diff --git a/scripts/qa_generation/README_EN.md b/scripts/qa_generation/README_EN.md
index b2768df..112b07f 100644
--- a/scripts/qa_generation/README_EN.md
+++ b/scripts/qa_generation/README_EN.md
@@ -93,3 +93,40 @@ Using books specialized in psychology to build QA knowledge pairs for RAG to pro
## **Step 4: Cleaning of QA pairs**
- Purpose of cleaning
+ - Improve the quality of extracted QA data and clean out QA pairs that are not relevant to psychology
+
+- Cleaning Methods
+
+ - Use the Prompt method to drive the LLM to make a judgment on the given QA pairs
+
+ - **Reference to Prompt**
+
+ - ```markdown
+ You are an experienced counselor and are familiar with psychology. Based on the QA pair I have provided, determine if this QA pair is psychological in nature.
+
+ The criteria are as follows:
+
+ - If the current QA pair belongs to the category of psychology, then return 1
+ - If the current QA pair does not belong to the category of psychology, then return 0.
+
+
+ The following is the content of the given psychology QA pair:
+ ```
+
+- Cleaning Tools
+
+ - Configure `DASHSCOPE_API_KEY` in `config/config.py`, see step 3 for how to get `API_KEY`.
+
+ - Use the provided cleaning script [QA_clean.py](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/qa_generation/QA_clean.py)
+
+- How to use
+
+ - Prepare the QA pair data to be cleaned
+
+ - Put the data into the data folder of the same level as the model.
+
+ - Modify `judge_dir` in `config/config.py` according to the folder name.
+
+ - If the folder that stores the data is named `xxx`, then set `judge_dir = os.path.join(data_dir, 'xxx')`.
+
+ - The cleaned QA pairs are stored as `jsonl` under `data/cleaned`.
From bf174f6e024b18895a2810f640b1c8c240b55b9b Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Mon, 18 Mar 2024 21:45:04 +0900
Subject: [PATCH 15/15] Update tutorial.md update part 5 and 6
---
generate_data/tutorial.md | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/generate_data/tutorial.md b/generate_data/tutorial.md
index 996a823..80426b4 100644
--- a/generate_data/tutorial.md
+++ b/generate_data/tutorial.md
@@ -100,7 +100,10 @@
5. **数据集整合**
- 在进行数据集整合之前,我们要检查生成的数据是否存在格式错误,类型不符合等情况。我们需要check.py进行检查数据。最后再使用merge_json.py将所有的json整合为一个总的json文件。
+ 在进行数据集整合之前,我们要检查生成的数据是否存在格式错误,类型不符合等情况。
+
+* 首先使用`check.py`进行数据检查。
+* 然后使用`merge_json.py`将所有的json整合为一个总的json文件。
6. **评估与优化**