commit 4a36ff428a

.gitignore (vendored), 6 lines changed
```diff
@@ -6,6 +6,11 @@ data/
 pdf/
 .idea/
 
+*.jsonl
+*.json
+# ./generate_data/*.josnl
+# ./generate_data/*/*/*.josnl
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
```
```diff
@@ -169,3 +174,4 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
```
```diff
@@ -116,11 +116,19 @@
 <img src="assets/Roadmap_ZH.png" alt="Roadmap_ZH">
 </a>
 
+### 🎯 Framework Diagram
+
+<p align="center">
+  <a href="https://github.com/aJupyter/EmoLLM/">
+    <img src="assets/框架图.png" alt="Roadmap_ZH">
+  </a>
+
 ## Table of Contents
 
 - [EmoLLM - Mental Health Large Model](#emollm-心理健康大模型)
 - [🎇 Recent Updates](#最近更新)
 - [🎯 Roadmap](#路线图)
+- [🎯 Framework Diagram](#框架图)
 - [Table of Contents](#目录)
 - [Pre-development Configuration Requirements](#开发前的配置要求)
 - [**User Guide**](#使用指南)
```
```diff
@@ -220,6 +228,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git
 | [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | |
 | [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | |
 | [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | |
+| [zealot52099](https://github.com/zealot52099) | AI Mover | | Data cleaning, RAG |
 
 ### Copyright Notice
```
```diff
@@ -244,7 +244,7 @@ This project uses Git for version control. You can see the currently available v
 | [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | |
 | [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | |
 | [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | |
+| [zealot52099](https://github.com/zealot52099) | AI Mover | | Data Processing and RAG |
 
 ### Copyright Notice
```
assets/框架图.png (new binary file, 204 KiB)
Binary file not shown.
generate_data/final_data/merge_jsonl.py (new file, 60 lines)

```python
import json
import os


def save_merge_json(data_lis, file_path):
    # Write the merged list as a single JSON array.
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))


def get_all_file_paths(folder_path, file_type='.jsonl'):
    # Make sure the argument is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    # Collect the paths of all matching files directly under the folder.
    file_paths = [os.path.join(folder_path, file)
                  for file in os.listdir(folder_path)
                  if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
    return file_paths


if __name__ == '__main__':
    conversion_lis = []

    folder_path = r'./'

    # Derive the output name from the folder being merged.
    merge_path = folder_path.split('/')[-1]
    try:
        merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2] != '.' else ''
    except IndexError:
        merge_last_path = ''
    print(f'merge_path={merge_path}, merge_last_path={merge_last_path}')

    for path in get_all_file_paths(folder_path):
        print(path)

        with open(path, 'rt', encoding='utf-8') as file:
            for line in file:
                line = line.rstrip('\n')
                # Parse the JSON object on this line.
                try:
                    data = json.loads(line)
                    conversion_lis.append(data)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")

    if merge_last_path != '':
        save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
    elif merge_path != '':
        save_merge_json_path = rf'./{merge_path}_merge.json'
    else:
        save_merge_json_path = r'./curr_merge.json'

    save_merge_json(data_lis=conversion_lis, file_path=save_merge_json_path)
    print(len(conversion_lis), save_merge_json_path)
```
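`save_merge_json` writes the merged records as a single JSON array (the `(',\n', ':')` separators only change whitespace, not structure), so the output can be read back with a plain `json.load`. A minimal usage sketch, assuming the script ran with its default `folder_path = './'` and therefore wrote `./curr_merge.json`:

```python
# A minimal usage sketch: read back the merged output. Assumes the
# script ran with its default folder_path = './' and so produced
# ./curr_merge.json.
import json

with open('./curr_merge.json', 'rt', encoding='utf-8') as f:
    merged = json.load(f)  # one list containing every JSONL record

print(type(merged), len(merged))
```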
generate_data/final_data/merge_jsonl_r.py (new file, 75 lines)

```python
import json
import os


def save_merge_json(data_lis, file_path):
    # Write the merged list as a single JSON array.
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))


def get_all_file_paths(folder_path, file_type='.jsonl'):
    # Make sure the argument is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    # Collect the paths of all matching files directly under the folder.
    file_paths = [os.path.join(folder_path, file)
                  for file in os.listdir(folder_path)
                  if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
    return file_paths


if __name__ == '__main__':
    data_ai = 'qwen'      # python merge_jsonl_r.py > qwen.txt
    # data_ai = 'zhipuai' # python merge_jsonl_r.py > zhipuai.txt
    root_dir = rf'./{data_ai}/'

    save_final_merge_json_path = f'{data_ai}_final_merge.json'

    subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir)
                  if os.path.isdir(os.path.join(root_dir, d))]

    final_list = []
    for folder_path in subfolders:
        conversion_lis = []
        merge_path = folder_path.split('/')[-1]
        try:
            merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2] != '.' else ''
        except IndexError:
            merge_last_path = ''
        print(f'merge_path={merge_path}, merge_last_path={merge_last_path}')

        for path in get_all_file_paths(folder_path):
            print(path)

            with open(path, 'rt', encoding='utf-8') as file:
                for line in file:
                    line = line.rstrip('\n')
                    # Parse the JSON object on this line.
                    try:
                        data = json.loads(line)
                        conversion_lis.append(data)
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON: {e}")

        # Write the per-subfolder merge file.
        if merge_last_path != '':
            save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
        elif merge_path != '':
            save_merge_json_path = rf'./{merge_path}_merge.json'
        else:
            save_merge_json_path = r'./curr_merge.json'

        save_merge_json(data_lis=conversion_lis, file_path=save_merge_json_path)

        final_list += conversion_lis
        print(len(conversion_lis), len(final_list), save_merge_json_path)

    # Write the merge of all subfolders combined.
    save_merge_json(data_lis=final_list, file_path=save_final_merge_json_path)
    print(save_final_merge_json_path)
```
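The script expects one level of subfolders under `./qwen/`, each holding `.jsonl` shards. The following sketch builds a toy tree so the inputs and outputs become concrete; the subfolder and file names are illustrative assumptions, not part of the commit:

```python
# A hypothetical layout sketch for merge_jsonl_r.py. Names below are
# illustrative assumptions only.
import json
import os

os.makedirs('./qwen/subset_a', exist_ok=True)
with open('./qwen/subset_a/part1.jsonl', 'wt', encoding='utf-8') as f:
    f.write(json.dumps({'q': 'hello', 'a': 'world'}, ensure_ascii=False) + '\n')

# Running `python merge_jsonl_r.py` against this tree should produce:
#   ./qwen/subset_a_merge.json   (per-subfolder merge)
#   ./qwen_final_merge.json      (all subfolders combined)
```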
```diff
@@ -100,7 +100,10 @@
 5. **Dataset consolidation**
 
-Before consolidating the datasets, we check the generated data for formatting errors, type mismatches, and similar issues. Use check.py to validate the data, then use merge_json.py to merge all the JSON into one combined JSON file.
+Before consolidating the datasets, we check the generated data for formatting errors, type mismatches, and similar issues.
+
+* First, use `check.py` to validate the data.
+* Then use `merge_json.py` to merge all the JSON into one combined JSON file.
 
 6. **Evaluation and optimization**
```
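The commit does not show `check.py` itself, so the following is only a sketch of the kind of validation this step describes; the `conversation` field and the string-only turn values are assumptions about the dataset schema, not taken from the diff:

```python
# A hedged sketch of a format/type check like the one this step
# describes. The schema (a "conversation" list of dicts whose values
# are all strings) is an assumption for illustration.
import json

def check_record(record):
    if not isinstance(record, dict) or not isinstance(record.get('conversation'), list):
        return False
    return all(
        isinstance(turn, dict) and all(isinstance(v, str) for v in turn.values())
        for turn in record['conversation']
    )

with open('sample.jsonl', 'rt', encoding='utf-8') as f:  # hypothetical input file
    for i, line in enumerate(f, 1):
        try:
            if not check_record(json.loads(line)):
                print(f'line {i}: wrong type or missing field')
        except json.JSONDecodeError as e:
            print(f'line {i}: invalid JSON: {e}')
```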
```diff
@@ -34,11 +34,21 @@ def zhipu_api(data, emo):
 
     top_p = round(random.uniform(0.1, 0.9), 2)
     messages = getText('user', prompt)
-    response = client.chat.completions.create(
-        model='glm-4',
-        messages=messages,
-        top_p=top_p,
-    )
+    # Error code: 400, with error text {"error":{"code":"1301","message":
+    # "The system detected that the input or generated content may contain unsafe or sensitive content. Please avoid prompts likely to produce sensitive content. Thank you for your cooperation."}}
+    try:
+        response = client.chat.completions.create(
+            model='glm-4',
+            messages=messages,
+            top_p=top_p,
+        )
+    except:
+        response = client.chat.completions.create(
+            model='glm-4',
+            messages=messages,
+            top_p=top_p,
+        )
 
     return response.choices[0].message.content
```
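The committed fix retries the blocked call exactly once by duplicating it inside the `except` arm. The same intent can be written as a bounded retry loop; this is a sketch of the pattern, not the project's code:

```python
# A sketch of the same one-retry pattern with a bounded loop instead of
# a duplicated call; `client`, `messages`, and `top_p` are as in the
# surrounding function. Not the committed code.
last_err = None
for attempt in range(2):  # original behavior: one retry
    try:
        response = client.chat.completions.create(
            model='glm-4',
            messages=messages,
            top_p=top_p,
        )
        break
    except Exception as e:  # the commit uses a bare except
        last_err = e
else:
    raise last_err
```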
```diff
@@ -1,11 +0,0 @@
-# Cleaning QA pairs
-Call qwen to judge whether each QA pair belongs to the field of psychology, and remove the QA pairs that do not.
-
-## Step 1
-1. Prepare the QA pair data to be cleaned
-2. Put the data into the data folder at the same level as model
-3. Modify judge_dir in config/config.py according to the folder name. I did not rename the folder, so my judge_dir is judge_dir = os.path.join(data_dir, '数据整合')
-
-## Step 2
-1. Run QA_clean.py
-2. The cleaned QA pairs are saved in jsonl format under data/cleaned
```
````diff
@@ -93,3 +93,34 @@
 ## **Step 4: Cleaning QA pairs**
 
 - Purpose of cleaning
+  - Improve the quality of the extracted QA data by removing QA pairs unrelated to psychology
+- Cleaning method
+  - Use a prompt to drive the LLM to judge each given QA pair
+  - **Reference prompt**
+  - ```markdown
+    You are an experienced counselor familiar with psychology. Based on the QA pair I provide, judge whether this QA pair belongs to the field of psychology.
+
+    The criteria are as follows:
+
+    - If the current QA pair belongs to the field of psychology, return 1
+    - If the current QA pair does not belong to the field of psychology, return 0
+
+    The following is the given psychology QA pair:
+    ```
+- Cleaning tool
+  - Configure `DASHSCOPE_API_KEY` in `config/config.py`; see Step 3 for how to obtain the `API_KEY`
+  - Use the provided cleaning script [QA_Clear](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/qa_generation/QA_clean.py)
+- Usage
+  - Prepare the QA pair data to be cleaned
+  - Put the data into the data folder at the same level as model
+  - Modify `judge_dir` in `config/config.py` according to the folder name
+    - If the folder storing the data is named `xxx`, then `judge_dir` is `judge_dir = os.path.join(data_dir, 'xxx')`
+  - The cleaned QA pairs are saved in `jsonl` format under `data/cleaned`
````
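A minimal sketch of the judging step this section describes; `call_llm` is a hypothetical stand-in for the Qwen API call that `QA_clean.py` actually makes:

```python
# A minimal sketch of the prompt-driven judging step. `call_llm` is a
# hypothetical stand-in for the actual Qwen call in QA_clean.py.
JUDGE_PROMPT = (
    "You are an experienced counselor familiar with psychology. "
    "Based on the QA pair I provide, judge whether this QA pair belongs "
    "to the field of psychology.\n"
    "- If it belongs to the field of psychology, return 1\n"
    "- If it does not, return 0\n"
    "The following is the given psychology QA pair:\n"
)

def is_psychology_qa(qa_pair: str, call_llm) -> bool:
    reply = call_llm(JUDGE_PROMPT + qa_pair)
    return reply.strip() == "1"
```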
````diff
@@ -93,3 +93,40 @@ Using books specialized in psychology to build QA knowledge pairs for RAG to pro
 ## **Step 4: Cleaning of QA pairs**
 
 - Purpose of cleaning
+  - Improve the quality of the extracted QA data and clean out QA pairs that are not relevant to psychology
+- Cleaning method
+  - Use the prompt method to drive the LLM to judge the given QA pairs
+  - **Reference prompt**
+  - ```markdown
+    You are an experienced counselor and are familiar with psychology. Based on the QA pair I have provided, determine if this QA pair is psychological in nature.
+
+    The criteria are as follows:
+
+    - If the current QA pair belongs to the category of psychology, then return 1
+    - If the current QA pair does not belong to the category of psychology, then return 0
+
+    The following is the content of the given psychology QA pair:
+    ```
+- Cleaning tools
+  - Configure `DASHSCOPE_API_KEY` in `config/config.py`; see Step 3 for how to get the `API_KEY`
+  - Use the provided cleaning script [QA_Clear](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/qa_generation/QA_clean.py)
+- How to use
+  - Prepare the QA pair data to be cleaned
+  - Put the data into the data folder at the same level as the model folder
+  - Modify `judge_dir` in `config/config.py` according to the folder name
+    - If the folder storing the data is named `xxx`, then `judge_dir` is `judge_dir = os.path.join(data_dir, 'xxx')`
+  - The cleaned QA pairs are stored as `jsonl` under `data/cleaned`
````
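Because the cleaned pairs land as JSON Lines under `data/cleaned`, they are easy to inspect; the file name in this sketch is a hypothetical example:

```python
# A minimal sketch for inspecting the cleaned output. The exact file
# name under data/cleaned is an assumption for illustration.
import json

kept = []
with open('data/cleaned/xxx.jsonl', 'rt', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            kept.append(json.loads(line))

print(f'{len(kept)} QA pairs kept after cleaning')
```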