Merge pull request #96 from zxazys/main

Framework diagram (框架图)
Commit 4a36ff428a, authored by xzw on 2024-03-18 21:23:25 +08:00 and committed by GitHub.
11 changed files with 824 additions and 604 deletions

.gitignore

@@ -6,6 +6,11 @@ data/
 pdf/
 .idea/
+*.jsonl
+*.json
+# ./generate_data/*.josnl
+# ./generate_data/*/*/*.josnl
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

@@ -169,3 +174,4 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/

README.md

@@ -116,11 +116,19 @@
   <img src="assets/Roadmap_ZH.png" alt="Roadmap_ZH">
 </a>
+
+### 🎯Framework Diagram
+<p align="center">
+  <a href="https://github.com/aJupyter/EmoLLM/">
+    <img src="assets/框架图.png" alt="框架图">
+  </a>
+</p>
+
 ## Table of Contents
 - [EmoLLM - Mental Health LLM](#emollm-心理健康大模型)
 - [🎇Recent Updates](#最近更新)
 - [🎯Roadmap](#路线图)
+- [🎯Framework Diagram](#框架图)
 - [Table of Contents](#目录)
 - [Pre-development Configuration Requirements](#开发前的配置要求)
 - [**User Guide**](#使用指南)

@@ -220,6 +228,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git
 | [Anooyman](https://github.com/Anooyman) | Master's student, Nanjing University of Science and Technology | | |
 | [Vicky-3021](https://github.com/Vicky-3021) | Master's student (Research Year 0), Xidian University | | |
 | [SantiagoTOP](https://github.com/santiagoTOP) | Master's student, Taiyuan University of Technology | | |
+| [zealot52099](https://github.com/zealot52099) | AI Mover | | Data cleaning, RAG |

 ### Copyright Notice

README_EN.md

@@ -244,7 +244,7 @@ This project uses Git for version control. You can see the currently available v
 | [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | |
 | [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | |
 | [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | |
+| [zealot52099](https://github.com/zealot52099) | AI Mover | | Data Processing and RAG |

 ### Copyright Notice

assets/框架图.png (new binary file, 204 KiB; not shown)

merge_jsonl.py (new file)

@@ -0,0 +1,60 @@
import json
import os


def save_merge_json(data_lis, file_path):
    # Dump the merged list of records into a single JSON file.
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))


def get_all_file_paths(folder_path, file_type='.jsonl'):
    # Make sure the argument is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")
    # Collect the paths of all matching files directly under the folder.
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path)
                  if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
    return file_paths


if __name__ == '__main__':
    conversion_lis = []
    folder_path = r'./'
    merge_path = folder_path.split('/')[-1]
    try:
        merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2] != '.' else ''
    except IndexError:
        merge_last_path = ''
    print(f'merge_path={merge_path}, merge_last_path={merge_last_path}')

    for path in get_all_file_paths(folder_path):
        print(path)
        with open(path, 'rt', encoding='utf-8') as file:
            for line in file:
                # Strip the trailing newline, then parse the line as JSON.
                line = line.rstrip('\n')
                try:
                    data = json.loads(line)
                    conversion_lis.append(data)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")

    # Name the output after the enclosing folder(s), falling back to curr_merge.json.
    if merge_last_path != '':
        save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
    elif merge_path != '':
        save_merge_json_path = rf'./{merge_path}_merge.json'
    else:
        save_merge_json_path = r'./curr_merge.json'
    save_merge_json(data_lis=conversion_lis, file_path=save_merge_json_path)
    print(len(conversion_lis), save_merge_json_path)
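
For reference, a minimal sketch of the round trip this script performs; the file name and record shape here are illustrative assumptions, not fixed by the script:

```python
import json

# Hypothetical input: one JSON record per line, e.g. a conversation sample.
record = {"conversation": [{"input": "...", "output": "..."}]}
with open('sample.jsonl', 'wt', encoding='utf-8') as f:
    f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Running the merge script in this directory then writes ./curr_merge.json,
# a single JSON list of every record: [{"conversation": [...]}, ...]
```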

merge_jsonl_r.py (new file)

@@ -0,0 +1,75 @@
import json
import os


def save_merge_json(data_lis, file_path):
    # Dump the merged list of records into a single JSON file.
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))


def get_all_file_paths(folder_path, file_type='.jsonl'):
    # Make sure the argument is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")
    # Collect the paths of all matching files directly under the folder.
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path)
                  if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
    return file_paths


if __name__ == '__main__':
    data_ai = 'qwen'      # python merge_jsonl_r.py > qwen.txt
    # data_ai = 'zhipuai' # python merge_jsonl_r.py > zhipuai.txt
    root_dir = rf'./{data_ai}/'
    save_final_merge_json_path = f'{data_ai}_final_merge.json'

    subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir)
                  if os.path.isdir(os.path.join(root_dir, d))]

    final_list = []
    for folder_path in subfolders:
        conversion_lis = []
        merge_path = folder_path.split('/')[-1]
        try:
            merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2] != '.' else ''
        except IndexError:
            merge_last_path = ''
        print(f'merge_path={merge_path}, merge_last_path={merge_last_path}')

        for path in get_all_file_paths(folder_path):
            print(path)
            with open(path, 'rt', encoding='utf-8') as file:
                for line in file:
                    # Strip the trailing newline, then parse the line as JSON.
                    line = line.rstrip('\n')
                    try:
                        data = json.loads(line)
                        conversion_lis.append(data)
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON: {e}")

        # Save the per-subfolder merge, then accumulate into the final list.
        if merge_last_path != '':
            save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
        elif merge_path != '':
            save_merge_json_path = rf'./{merge_path}_merge.json'
        else:
            save_merge_json_path = r'./curr_merge.json'
        save_merge_json(data_lis=conversion_lis, file_path=save_merge_json_path)
        final_list = final_list + conversion_lis
        print(len(conversion_lis), len(final_list), save_merge_json_path)

    save_merge_json(data_lis=final_list, file_path=save_final_merge_json_path)
    print(save_final_merge_json_path)
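
For orientation, the directory layout this variant expects, with illustrative subfolder and file names; it writes one `*_merge.json` per subfolder and a final `qwen_final_merge.json` in the working directory:

```
./qwen/                    <- root_dir for data_ai = 'qwen'
├── batch_a/
│   ├── part1.jsonl        -> merged into ./qwen/batch_a_merge.json
│   └── part2.jsonl
└── batch_b/
    └── part1.jsonl        -> merged into ./qwen/batch_b_merge.json
```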


@@ -100,7 +100,10 @@
 5. **Dataset integration**
-   Before integrating the dataset, we need to check whether the generated data has formatting errors or mismatched types. We use check.py to check the data, and finally use merge_json.py to merge all the JSON into one overall JSON file.
+   Before integrating the dataset, we need to check whether the generated data has formatting errors, mismatched types, and similar problems.
+   * First run `check.py` to validate the data (see the sketch after this list).
+   * Then run `merge_json.py` to merge all the JSON files into a single overall JSON file.
 6. **Evaluation and optimization**
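
check.py itself is not shown in this commit. As a rough illustration only, a validator in its spirit might look like the sketch below; the `conversation`/`input`/`output` field names are an assumption about the generated record format, not the script's actual schema:

```python
import json

def check_line(line: str) -> bool:
    """Return True when a generated record parses and has the expected shape."""
    try:
        record = json.loads(line)
    except json.JSONDecodeError:
        return False
    # Assumed schema: {"conversation": [{"input": str, "output": str}, ...]}
    turns = record.get("conversation")
    if not isinstance(turns, list) or not turns:
        return False
    return all(isinstance(t, dict)
               and isinstance(t.get("input"), str)
               and isinstance(t.get("output"), str)
               for t in turns)
```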


@@ -34,11 +34,21 @@ def zhipu_api(data, emo):
     top_p = round(random.uniform(0.1, 0.9), 2)
     messages = getText('user', prompt)

-    response = client.chat.completions.create(
-        model='glm-4',
-        messages=messages,
-        top_p=top_p,
-    )
+    # Error code: 400, with error text {"error":{"code":"1301","message":
+    # "The system detected that the input or generated content may contain unsafe or sensitive content. Please avoid prompts that tend to produce sensitive content. Thank you for your cooperation."}}
+    try:
+        response = client.chat.completions.create(
+            model='glm-4',
+            messages=messages,
+            top_p=top_p,
+        )
+    except Exception:
+        # Retry once on failure (e.g. the content-safety 400 above).
+        response = client.chat.completions.create(
+            model='glm-4',
+            messages=messages,
+            top_p=top_p,
+        )
     return response.choices[0].message.content
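
Retrying exactly once still raises if the second call also fails, and the content-safety 400 quoted above will fail deterministically for the same prompt. A more defensive pattern is a bounded retry with backoff that lets the caller skip the sample; this is a sketch, not what the script currently does:

```python
import time

def create_with_retry(client, messages, top_p, retries=3):
    """Call the GLM-4 chat endpoint with bounded retries; None if all attempts fail."""
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model='glm-4',
                messages=messages,
                top_p=top_p,
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"glm-4 call failed (attempt {attempt + 1}/{retries}): {e}")
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s between attempts
    return None  # caller decides whether to skip this sample
```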


@@ -1,11 +0,0 @@
-# Cleaning QA pairs
-Call qwen to judge whether the current QA pair belongs to the field of psychology, and remove the QA pairs that do not.
-## Step 1
-1. Prepare the QA pair data that needs cleaning.
-2. Put the data into the data folder at the same level as model.
-3. Modify judge_dir in config/config.py according to the folder name. I did not rename the file, so my judge_dir is judge_dir = os.path.join(data_dir, '数据整合')
-## Step 2
-1. Run QA_clean.py.
-2. The cleaned QA pairs are stored in jsonl format under data/cleaned.


@@ -93,3 +93,34 @@
 ## **Step 4: Clean the QA pairs**
 - Purpose of cleaning
+  - Improve the quality of the extracted QA data and clean out QA pairs unrelated to psychology
+- Cleaning method
+  - Use a prompt to drive the LLM to judge each given QA pair
+  - **Reference prompt**
+  - ```markdown
+    You are an experienced counselor, familiar with psychology. Based on the QA pair I provide, judge whether this QA pair belongs to the field of psychology.
+    The criteria are as follows:
+    - If the current QA pair belongs to the field of psychology, return 1
+    - If the current QA pair does not belong to the field of psychology, return 0
+    The following is the given psychology QA pair:
+    ```
+- Cleaning tools
+  - Configure `DASHSCOPE_API_KEY` in `config/config.py`; see Step 3 for how to obtain the `API_KEY`
+  - Use the provided cleaning script [QA_Clear](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/qa_generation/QA_clean.py)
+- How to use
+  - Prepare the QA pair data to be cleaned
+  - Put the data into the data folder at the same level as model
+  - Modify `judge_dir` in `config/config.py` according to the folder name
+    - If the folder holding the data is named `xxx`, then `judge_dir` is `judge_dir = os.path.join(data_dir, 'xxx')`
+  - The cleaned QA pairs are stored in `jsonl` format under `data/cleaned`


@@ -93,3 +93,40 @@ Using books specialized in psychology to build QA knowledge pairs for RAG to pro
 ## **Step 4: Cleaning of QA pairs**
 - Purpose of cleaning
+  - Improve the quality of the extracted QA data and clean out QA pairs that are not relevant to psychology
+- Cleaning methods
+  - Use the prompt method to drive the LLM to judge each given QA pair (see the sketch after this list)
+  - **Reference prompt**
+  - ```markdown
+    You are an experienced counselor and are familiar with psychology. Based on the QA pair I provide, determine whether this QA pair is psychological in nature.
+    The criteria are as follows:
+    - If the current QA pair belongs to the category of psychology, return 1
+    - If the current QA pair does not belong to the category of psychology, return 0
+    The following is the content of the given psychology QA pair:
+    ```
+- Cleaning tools
+  - Configure `DASHSCOPE_API_KEY` in `config/config.py`; see Step 3 for how to get the `API_KEY`
+  - Use the provided cleaning script [QA_Clear](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/qa_generation/QA_clean.py)
+- How to use
+  - Prepare the QA pair data to be cleaned
+  - Put the data into the data folder at the same level as the model
+  - Modify `judge_dir` in `config/config.py` according to the folder name
+    - If the folder holding the data is named `xxx`, then `judge_dir` is `judge_dir = os.path.join(data_dir, 'xxx')`
+  - The cleaned QA pairs are stored as `jsonl` under `data/cleaned`
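
To make the judging step concrete, here is a minimal sketch of how the reference prompt above can drive a binary keep/drop decision. It is not the actual QA_clean.py; `call_llm` is a hypothetical stand-in for whichever API client the script configures:

```python
JUDGE_PROMPT = (
    "You are an experienced counselor and are familiar with psychology. "
    "Based on the QA pair I provide, determine whether this QA pair is "
    "psychological in nature.\n"
    "The criteria are as follows:\n"
    "- If the current QA pair belongs to the category of psychology, return 1\n"
    "- If the current QA pair does not belong to the category of psychology, return 0\n"
    "The following is the content of the given psychology QA pair:\n"
)

def is_psychology_qa(qa_pair: dict, call_llm) -> bool:
    """Ask the LLM judge; keep the pair only when it answers 1."""
    reply = call_llm(JUDGE_PROMPT + str(qa_pair))
    # The judge is asked for a bare 0 or 1; tolerate surrounding whitespace/text.
    return reply.strip().startswith("1")

# Usage: cleaned = [qa for qa in qa_pairs if is_psychology_qa(qa, call_llm)]
```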