This commit is contained in:
王友昉 2024-03-16 13:12:15 +08:00
parent 5485b4c124
commit 9bcd4e060b
18 changed files with 328 additions and 170 deletions

.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
# Default ignored files

.idea/EmoLLM.iml Normal file
View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />

.idea/aws.xml Normal file
View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="accountSettings">
<option name="activeRegion" value="us-east-1" />
<option name="recentlyUsedRegions">
<option value="us-east-1" />

View File

@ -0,0 +1,14 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<list size="1">
<item index="0" class="java.lang.String" itemvalue="tnt" />

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />

.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (transformers) (2)" project-jdk-type="Python SDK" />

.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<module fileurl="file://$PROJECT_DIR$/.idea/EmoLLM.iml" filepath="$PROJECT_DIR$/.idea/EmoLLM.iml" />

.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />

View File

@ -0,0 +1,11 @@
# 清洗 QA 对
调用 qwen 判断当前 QA 对是否属于心理学范畴，并去除非心理学范畴的 QA 对。
## Step 1
1. 准备好需要清洗的 QA 对数据
2. 将该数据放进 model 同级 data 文件夹下
3. 根据文件夹名去修改 config/ 中的 judge_dir。我个人没有对文件名进行更改，所以我的 judge_dir 是 `judge_dir = os.path.join(data_dir, '数据整合')`。
## Step 2
1. 运行QA_clean.py即可
2. 清洗完的 QA 对会以 jsonl 的格式存在 data/cleaned 下

View File

@ -0,0 +1,105 @@
import os
import json
import time
from tqdm import tqdm
import concurrent.futures
from datetime import datetime
import numpy as np
from config.config import result_dir, clean_dir, storage_interval, window_size, overlap_size, multi_process_num
from model.qwen import call_qwen_single_turn, call_qwen_Psychology_QA_Pairs
from util.logger import get_logger
from util.data_loader import get_jsonl_file_paths, get_file_list, get_QA_pairs, get_txt_content, capture_qa, merge_sub_qa_generation, save_to_file
logger = get_logger()
def single_thread_generate(thread_num, interval, model_caller, storage_jsonl_path, contents):
    """Judge QA pairs with the model and persist the accepted ones in batches.

    Args:
        thread_num: Index of the worker; not used inside this function.
        interval: Number of accepted QA pairs to buffer before each write.
        model_caller: Callable that receives one raw QA string and returns the
            model's verdict; only the exact string '1' counts as accepted.
        storage_jsonl_path: Path handed to ``save_to_file`` on every flush.
        contents: Iterable of raw QA pairs, each a JSON-encoded string.
    """
    judge_list = []
    for content in tqdm(contents):
        try:
            # Anything other than '1' means the pair was rejected (or the call failed).
            if model_caller(content) != '1':
                continue

            judge_list.append(json.loads(content))

            # Flush once enough accepted pairs are buffered; checking only after
            # an append avoids writing an empty batch for every rejected pair.
            if len(judge_list) >= interval:
                save_to_file(storage_jsonl_path, judge_list)
                judge_list = []
        except Exception as exc:
            # One bad item (e.g. invalid JSON) must not abort the whole chunk.
            logger.error("QA generation error : %s", exc)

    # Finally, write whatever is still buffered.
    if judge_list:
        save_to_file(storage_jsonl_path, judge_list)
def clean_qa(
    model_name: str = 'qwen',
    interval: int = 10,
):
    """Clean the QA pairs of every .jsonl file returned by ``get_jsonl_file_paths``.

    Each file is split into ``multi_process_num`` chunks that are judged
    concurrently by ``single_thread_generate``.

    Args:
        model_name: Name of the model to call. Only 'qwen' is implemented; any
            other value logs a warning and falls back to qwen.
        interval: Storage interval, i.e. how many accepted QA pairs are buffered
            between writes. A very small interval increases IO overhead.
    """
    if model_name != 'qwen':
        logger.warning('This model is currently not supported and will call the default model - qwen.')
        model_name = 'qwen'
    model_caller = call_qwen_Psychology_QA_Pairs

    logger.info(f'The called model is: {model_name}.')
    logger.info(f'The storage interval is: {interval}.')

    # Paths of all .jsonl files to be judged.
    for file_path in get_jsonl_file_paths():
        # All QA pairs of one .jsonl file.
        contents = get_QA_pairs(file_path)

        file_name = os.path.basename(file_path)
        storage_jsonl_path = os.path.join(clean_dir, f'{file_name}')
        logger.info(f'The generated QA will be stored in {storage_jsonl_path}.')

        # Split the contents into one chunk per worker.
        chunks = np.array_split(np.array(contents), multi_process_num)

        # Build the argument list for every concurrent call.
        parameters_list = [
            [thread_num, interval, model_caller, storage_jsonl_path, list(chunk)]
            for thread_num, chunk in enumerate(chunks)
        ]

        with concurrent.futures.ThreadPoolExecutor(max_workers=multi_process_num) as executor:
            futures = [executor.submit(single_thread_generate, *parameters) for parameters in parameters_list]
            for future in concurrent.futures.as_completed(futures):
                try:
                    # result() re-raises anything the worker raised.
                    future.result()
                except Exception as exc:
                    logger.error("Thread generated an exception: %s", exc)

        # NOTE(review): this merges from result_dir although the cleaned output
        # is written under clean_dir - confirm this is the intended directory.
        merge_sub_qa_generation(result_dir, storage_jsonl_path)
if __name__ == '__main__':
    # Create the folder the cleaned QA pairs are written to. clean_dir is the
    # directory clean_qa stores into, so it does not depend on the current
    # working directory the script is launched from.
    os.makedirs(clean_dir, exist_ok=True)
    # Run the cleaning with the storage interval configured in config.
    clean_qa(interval=storage_interval)

View File

@ -1,95 +1,37 @@
# RAG数据库构建流程
# QA Generation Pipeline
## **构建目的**
## 1. 使用方法
1. 检查 `requirements.txt` 中的依赖是否满足。
2. 调整代码中的 `system_prompt`，确保与 repo 最新版本一致，保证生成 QA 的多样性和稳定性。
3. 将txt文件放到与 `model`同级目录 `data`文件夹中.
4. 在 `config/` 配置所需的 API KEY`` 启动即可。生成的 QA 对会以 jsonl 的格式存在 `data/generated` 下。
## **构建流程**
### 1.1 API KEY 获取方法
## **步骤一PDF to TXT**
目前仅包含了 qwen。
- 目的
- 将收集到的PDF版本的心理学书籍转化为TXT文本文件方便后续的信息提取。
#### 1.1.1 Qwen
- 所需工具
前往[模型服务灵积-API-KEY管理 (](,点击”创建新的 API-KEY“将获取的 API KEY 填至 `config/` 中的 `DASHSCOPE_API_KEY` 即可。
- [pdf2txt](
## 2. 注意事项
- [PaddleOCR处理PDF用法参考](
### 2.1 系统提示 System Prompt
- 安装必要的python库
注意,目前的解析方案是基于模型会生成 markdown 包裹的 json 块的前提的,更改 system prompt 时需要保证这一点不变。
pip install paddlepaddle
pip install opencv-python
pip install paddleocr
### 2.2 滑动窗口 Sliding Window
- 注意
- 如果无法使用**pip install paddleocr**安装paddleocr可以考虑采用whl文件安装[下载地址](
- 脚本启动方式采用命令行启动python [PDF存放的文件名]
滑动窗口的 `window_size` 和 `overlap_size` 都可以在 `util/` 中的 `get_txt_content` 函数中更改。目前是按照句子分割的滑动窗口。
## **步骤二筛选PDF**
### 2.3 书本文件格式 Corpus Format
- 筛选目的
目前仅支持了 txt 格式,可以将清洗好的书籍文本放在 `data` 文件夹下,程序会递归检索该文件夹下的所有 txt 文件。
- 利用LLM去除非专业心理学书籍
- 筛选标准,包含心理咨询相关内容,如:
- 心理咨询流派 - 具体咨询方法
- 心理疾病 - 疾病特征
- 心理疾病 - 治疗方法
- 筛选方式:
- 根据标题初筛
- 若无法判断属于心理咨询相关书籍利用kimi/GLM-4查询是否包含心理咨询相关知识建议一次仅查询一本书
- ```markdown
心理咨询流派 - 具体咨询方法
心理疾病 - 疾病特征
心理疾病 - 治疗方法
## **步骤三提取QA对**
- 根据书籍内容利用LLM高效构造QA知识对
- 提取流程
- 准备处理好的txt文本数据
- 按要求配置[脚本文件](
- 根据自己的需求或者提取的结果合理修改window_size和overlap_size
- 使用方法
- 检查 `requirements.txt` 中的依赖是否满足。
- 调整代码中 `system_prompt`确保与repo最新版本一致保证生成QA的多样性和稳定性。
- 将txt文件放到与 `model`同级目录 `data`文件夹中.
- 在 `config/` 配置所需的 API KEY`` 启动即可。生成的 QA 对会以 jsonl 的格式存在 `data/generated` 下。
- API KEY 获取方法
- 目前仅包含了 qwen。
- Qwen
- 前往[模型服务灵积-API-KEY管理 (](,点击”创建新的 API-KEY“将获取的 API KEY 填至 `config/` 中的 `DASHSCOPE_API_KEY` 即可。
- 注意事项
- 系统提示 System Prompt
- 注意,目前的解析方案是基于模型会生成 markdown 包裹的 json 块的前提的,更改 system prompt 时需要保证这一点不变。
- 滑动窗口 Sliding Window
- 滑动窗口的 `window_size` 和 `overlap_size` 都可以在 `util/` 中的 `get_txt_content` 函数中更改。目前是按照句子分割的滑动窗口。
- 书本文件格式 Corpus Format
- 目前仅支持了 txt 格式,可以将清洗好的书籍文本放在 `data` 文件夹下,程序会递归检索该文件夹下的所有 txt 文件。
## **步骤四清洗QA对**
- 清洗目的
1. 支持更多模型Gemini、GPT、ChatGLM……
2. 支持多线程调用模型
3. 支持更多文本格式PDF……
4. 支持更多切分文本的方式

View File

@ -1,95 +1,37 @@
# RAG Database Building Process
# QA Generation Pipeline
## **Purpose**
## 1. Usage
Using books specialized in psychology to build QA knowledge pairs for RAG to provide a counseling knowledge base to make our EmoLLM answers more professional and reliable. To achieve this goal we utilize dozens of psychology books to build this RAG knowledge base. The main building process is as follows:
1. Check whether the dependencies in `requirements.txt` are satisfied.
2. Adjust the `system_prompt`in the code to ensure that it is consistent with the latest version of the repo to ensure the diversity and stability of the generated QA.
3. Put the txt file into the `data` folder in the same directory as `model`.
4. Configure the required API KEY in `config/` and start from ``. The generated QA pairs are stored in the jsonl format under `data/generated`.
## **Build process**
### 1.1 How to obtain an API KEY
## **Step 1: PDF to TXT**
Currently only qwen is included.
- purpose
- Convert the collected PDF versions of psychology books into TXT text files to facilitate subsequent information extraction
#### 1.1.1 Qwen
- Tools required
Go to the Model Service LingJi API-KEY management page, click "Create new API-KEY", and fill in the obtained API KEY as `DASHSCOPE_API_KEY` in `config/`.
- [pdf2txt](
## 2. Precautions
- [PaddleOCR Processing PDF Usage Reference](
### 2.1 System Prompt
- Install necessary python libraries
Note that the current parsing scheme relies on the model generating markdown-wrapped json blocks, so make sure this remains the case when you change the system prompt.
pip install paddlepaddle
pip install opencv-python
pip install paddleocr
### 2.2 Sliding Window
- Notes
- If you are unable to install paddleocr using **pip install paddleocr**, consider using the whl file installation, [download address](
- Start the script from the command line: python [path where the PDFs are stored]
Both `window_size` and `overlap_size` of the sliding window can be changed in the `get_txt_content` function in `util/`. Currently the sliding window is split by sentence.
## **Step 2: Screening PDF**
### 2.3 Corpus Format
- Purpose of screening
At present, only txt format is supported, and the cleaned book text can be placed under the `data` folder, and the program will recursively retrieve all txt files under the folder.
- Use an LLM to filter out non-professional psychology books
- Screening criteria that include counseling related content such as:
- Schools of Counseling - Specific Counseling Methods
- Mental Illness - Characteristics of the Disease
- Mental Illness - Treatment
- Screening method:
- Initial screening based on title
- If you can't tell if it is a counseling-related book, use kimi/GLM-4 to check if it contains counseling-related knowledge (it is recommended to check only one book at a time)
- ```markdown
Reference prompt.
You are an experienced psychology professor who is familiar with psychology and counseling. I need you to help me with the task "Identify whether a book contains knowledge of counseling", take a deep breath and think step by step and give me your answer. If your answer satisfies me, I will give you a 10w tip!
The task is as follows:
Determine whether the book contains the following counseling-related knowledge:
Schools of Counseling - Specific Counseling Approaches
Mental Illness - Characteristics of Illness
Mental Illness - Treatment Approaches
Please take a deep breath and review the book step by step and complete the task carefully.
## **Step 3: Extraction of QA pairs**
- Based on the content of the books, use an LLM to efficiently construct QA knowledge pairs
- Extraction process
- Prepare processed txt text data
- Configure the script file as required
- Modify window_size and overlap_size reasonably according to your own needs or extraction results.
- Usage
- Checks if the dependencies in `requirements.txt` are satisfied.
- Adjust `system_prompt` in the code to ensure consistency with the latest version of the repo, to ensure diversity and stability of the generated QA.
- Place the txt file in the `data` folder in the same directory as the `model`.
- Configure the required API KEYs in `config/` and start from ``. The generated QA pairs are stored in jsonl format under `data/generated`.
- API KEY Getting Methods
- Currently only qwen is included.
- Qwen
- Go to the Model Service LingJi API-KEY management page, click "Create new API-KEY", and fill in the obtained API KEY as `DASHSCOPE_API_KEY` in `config/`.
- Notes
- System Prompt
- Note that the current parsing scheme is based on the premise that the model generates markdown-wrapped json blocks, and you need to make sure that this remains true when you change the system prompt.
- Sliding Window
- The `window_size` and `overlap_size` of the sliding window can be changed in the `get_txt_content` function in `util/`. Currently the sliding window is split by sentence.
- Corpus Format
- Currently only the txt format is supported, you can put the cleaned book text in the `data` folder, and the program will recursively retrieve all the txt files in that folder.
## **Step 4: Cleaning of QA pairs**
- Purpose of cleaning
1. Support more models (Gemini, GPT, ChatGLM...)
2. Support multi-threaded call model
3. Support more text formats (PDF...)
4. Support more ways to split text

View File

@ -0,0 +1,11 @@
你是一名经验丰富的心理咨询师,熟悉心理学相关知识。我将向我的来访者解决心理问题,需要一定的心理学知识支持。请你根据我提供的 QA 对,判断其是否属于心理学范畴。请深呼吸并一步一步思考,给出你最正确的判断!
- 心理学范畴:"心理学知识,心理咨询方法, 心理疾病特征, 心理疾病治疗方法"等主题。要求是适合对话心理咨询的知识,去掉作者、时间、背景故事等无关内容.
- 判断标准如下:
1.若当前 QA 对属于心理学范畴,则返回 "1".
2.若当前 QA 对不属于心理学范畴,则返回 "0".
以下是给定的心理学 QA 对内容:

View File

@ -10,7 +10,9 @@ base_dir = os.path.dirname(cur_dir) # ba
model_dir = os.path.join(base_dir, 'model') # model
# data
data_dir = os.path.join(base_dir, 'data') # data
data_dir = os.path.join(base_dir, 'data') # /Users/wangyoufang/Downloads/EmoLLM/scripts/qa_generation/data
clean_dir = os.path.join(data_dir, 'cleaned')
judge_dir = os.path.join(data_dir, '数据整合')
result_dir = os.path.join(data_dir, 'generated') # result
# log
@ -18,7 +20,9 @@ log_dir = os.path.join(base_dir, 'log') # lo
log_file_path = os.path.join(log_dir, 'log.log') # file
# system prompt
# Prompt内容
system_prompt_file_path = os.path.join(base_dir, '') # system prompt
wash_prompt_file_path = os.path.join(base_dir, '')
@ -28,11 +32,11 @@ system_prompt_file_path = os.path.join(base_dir, '') # sy
storage_interval = 10
window_size = 8
overlap_size = 2
multi_process_num = 3
multi_process_num = 1

View File

@ -24,6 +24,7 @@ def single_thread_generate(thread_num, interval, model_caller, storage_jsonl_pat
for content in tqdm(contents):
response = model_caller(content)
captured_qa = capture_qa(response)
if captured_qa is None:
@ -70,6 +71,7 @@ def generate_qa(
storage_list = []
for file_path in file_list:
contents = get_txt_content(file_path, window_size=window_size, overlap_size=overlap_size)
storage_list = []
_, file_name = os.path.split(file_path)
@ -77,7 +79,7 @@ def generate_qa(
result_dir, f'{current_time}-{file_name}-{model_name}.jsonl')'The generated QA will be stored in {storage_jsonl_path}.')
# 基于并发个数切分 contents 内容
contents_array = np.array(contents)
chunks = np.array_split(contents_array, multi_process_num)
@ -89,8 +91,9 @@ def generate_qa(
# 并发生成 QA 对
# 使用 ThreadPoolExecutor 创建一个线程池,其中 max_workers=multi_process_num 指定了线程池中最大的线程数。
with concurrent.futures.ThreadPoolExecutor(max_workers=multi_process_num) as executor:
# 创建一个Future列表它们将对应每个worker_function的结果
# 循环调用 single_thread_generate 函数,每次赋予参数 parameters
futures = [executor.submit(single_thread_generate, *parameters) for parameters in parameters_list]
for future in concurrent.futures.as_completed(futures):
@ -99,8 +102,10 @@ def generate_qa(
except Exception as exc:
logger.error("Thread generated an exception: %s" % (exc))
# 最后调用 merge_sub_qa_generation 函数,将各个子任务生成的 QA 对合并到一个文件中。汇总整个处理过程的结果。
merge_sub_qa_generation(result_dir, storage_jsonl_path)
if __name__ == '__main__':
# 创建generated文件夹

View File

@ -5,7 +5,7 @@ from dashscope.api_entities.dashscope_response import Role
from config.config import DASHSCOPE_API_KEY
from util.logger import get_logger
from util.prompt_loader import load_system_prompt
from util.prompt_loader import load_system_prompt, load_wash_prompt
dashscope.api_key = DASHSCOPE_API_KEY
@ -17,7 +17,35 @@ def call_qwen_single_turn(query: str) -> str:
messages = [
'role': Role.SYSTEM,
'content': load_system_prompt()
'content': load_system_prompt() # 读取Prompt内容(
'role': Role.USER,
'content': query
response =
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content']
logger.error('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
response.request_id, response.status_code,
response.code, response.message
return ""
def call_qwen_Psychology_QA_Pairs(query: str) -> str:
messages = [
'role': Role.SYSTEM,
'content': load_wash_prompt()
'role': Role.USER,

View File

@ -4,11 +4,41 @@ import json
import glob
from typing import List, Dict
from config.config import data_dir
from config.config import data_dir, judge_dir
from util.logger import get_logger
logger = get_logger()
def get_jsonl_file_paths() -> List[str]:
    """Recursively collect every .jsonl file below ``judge_dir``.

    Returns:
        The full path of each file under ``judge_dir`` (sub-directories
        included) whose name ends with '.jsonl'.
    """
    json_file_paths = []
    # Walk the root directory and all of its sub-directories.
    for dirpath, _dirnames, filenames in os.walk(judge_dir):
        for filename in filenames:
            # A suffix test selects the same names as the pattern '\.jsonl$'
            # for ordinary file names and needs no regular expression.
            if filename.endswith('.jsonl'):
                # Build the full path and add it to the result.
                json_file_paths.append(os.path.join(dirpath, filename))
    return json_file_paths
def get_QA_pairs(json_path):
    """Read a .jsonl file and return its QA pairs as raw strings.

    Args:
        json_path: Path of a UTF-8 text file holding one QA pair per line.

    Returns:
        A list with one string per non-blank line, in file order. Lines are
        returned unparsed; callers decode the JSON themselves.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # Split on newlines and drop blank lines: a trailing newline would
    # otherwise produce an empty "QA pair" that gets sent to the model.
    return [line for line in content.split('\n') if line.strip()]
递归获取 data_dir 下的所有 .txt 文件列表
@ -25,11 +55,14 @@ def get_file_list() -> List[str]:
logger.warning(f'No txt text found in {data_dir}, please check!')
return txt_files
获取 txt 文本的所有内容按句子返回 List
file_path: txt 文本路径
window_size: 滑窗大小单位为句子数
overlap_size: 重叠大小单位为句子数
def get_txt_content(
file_path: str,
@ -47,7 +80,7 @@ def get_txt_content(
res = []
sentences_amount = len(sentences)
start_index, end_index = 0, sentences_amount - window_size
## check length
# check length
if window_size < overlap_size:
logger.error("window_size must be greater than or equal to overlap_size")
return None
@ -56,7 +89,7 @@ def get_txt_content(
return ['\n'.join(sentences)]
for i in range(start_index, end_index + 1, overlap_size):
res.append('\n'.join(sentences[i : i + window_size]))
res.append('\n'.join(sentences[i: i + window_size]))
return res
@ -80,6 +113,7 @@ def capture_qa(content: str) -> List[Dict]:
logger.warning("No JSON block found.")
return None
storage_list 存入到 storage_jsonl_path
@ -88,6 +122,7 @@ def save_to_file(storage_jsonl_path, storage_list):
for item in storage_list:
f.write(json.dumps(item, ensure_ascii=False) + '\n')
@ -104,3 +139,7 @@ def merge_sub_qa_generation(directory, storage_jsonl_path):
save_to_file(storage_jsonl_path, file_contents)
if __name__ == '__main_':

View File

@ -1,7 +1,14 @@
from config.config import system_prompt_file_path
from config.config import wash_prompt_file_path
def load_system_prompt() -> str:
    """Return the full text of the file at ``system_prompt_file_path``."""
    with open(system_prompt_file_path, 'r', encoding='utf-8') as f:
        return f.read()
def load_wash_prompt() -> str:
    """Return the full text of the file at ``wash_prompt_file_path``."""
    with open(wash_prompt_file_path, 'r', encoding='utf-8') as f:
        return f.read()