Merge pull request #96 from zxazys/main

Framework diagram (框架图)
Commit 4a36ff428a, authored by xzw on 2024-03-18 21:23:25 +08:00 and committed by GitHub.
11 changed files with 824 additions and 604 deletions

.gitignore

@@ -6,6 +6,11 @@ data/
 pdf/
 .idea/
+*.jsonl
+*.json
+# ./generate_data/*.josnl
+# ./generate_data/*/*/*.josnl
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

@@ -169,3 +174,4 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/

README.md

@@ -116,11 +116,19 @@
   <img src="assets/Roadmap_ZH.png" alt="Roadmap_ZH">
 </a>
+
+### 🎯Framework Diagram
+<p align="center">
+  <a href="https://github.com/aJupyter/EmoLLM/">
+    <img src="assets/框架图.png" alt="框架图">
+  </a>
+</p>
+
 ## Table of Contents
 - [EmoLLM - Mental Health LLM](#emollm-心理健康大模型)
 - [🎇Recent Updates](#最近更新)
 - [🎯Roadmap](#路线图)
+- [🎯Framework Diagram](#框架图)
 - [Table of Contents](#目录)
 - [Pre-development Configuration Requirements](#开发前的配置要求)
 - [**User Guide**](#使用指南)

@@ -220,6 +228,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git
 | [Anooyman](https://github.com/Anooyman) | Master's student, Nanjing University of Science and Technology | | |
 | [Vicky-3021](https://github.com/Vicky-3021) | Master's student (Research Year 0), Xidian University | | |
 | [SantiagoTOP](https://github.com/santiagoTOP) | Master's student, Taiyuan University of Technology | | |
+| [zealot52099](https://github.com/zealot52099) | AI Mover | | Data cleaning, RAG |

 ### Copyright Notice

README_EN.md

@@ -244,7 +244,7 @@ This project uses Git for version control. You can see the currently available v
 | [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | |
 | [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | |
 | [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | |
+| [zealot52099](https://github.com/zealot52099) | AI Mover | | Data Processing and RAG |

 ### Copyright Notice

assets/框架图.png (new binary file, 204 KiB; not shown)

merge_jsonl.py (new file)

@@ -0,0 +1,60 @@
import json
import os


def save_merge_json(data_lis, file_path):
    # Dump the merged list of records into a single JSON file.
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))


def get_all_file_paths(folder_path, file_type='.jsonl'):
    # Make sure the argument is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")
    # Collect the paths of all matching files directly under the folder.
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path)
                  if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
    return file_paths


if __name__ == '__main__':
    conversion_lis = []
    folder_path = r'./'
    merge_path = folder_path.split('/')[-1]
    try:
        merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2] != '.' else ''
    except IndexError:
        merge_last_path = ''
    print(f'merge_path={merge_path}, merge_last_path={merge_last_path}')

    for path in get_all_file_paths(folder_path):
        print(path)
        with open(path, 'rt', encoding='utf-8') as file:
            for line in file:
                # Strip the trailing newline, then parse the line as JSON.
                line = line.rstrip('\n')
                try:
                    data = json.loads(line)
                    conversion_lis.append(data)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")

    # Name the output after the enclosing folder(s), falling back to curr_merge.json.
    if merge_last_path != '':
        save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
    elif merge_path != '':
        save_merge_json_path = rf'./{merge_path}_merge.json'
    else:
        save_merge_json_path = r'./curr_merge.json'
    save_merge_json(data_lis=conversion_lis, file_path=save_merge_json_path)
    print(len(conversion_lis), save_merge_json_path)
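
For reference, a minimal sketch of the round trip this script performs; the file name and record shape here are illustrative assumptions, not fixed by the script:

```python
import json

# Hypothetical input: one JSON record per line, e.g. a conversation sample.
record = {"conversation": [{"input": "...", "output": "..."}]}
with open('sample.jsonl', 'wt', encoding='utf-8') as f:
    f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Running the merge script in this directory then writes ./curr_merge.json,
# a single JSON list of every record: [{"conversation": [...]}, ...]
```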

merge_jsonl_r.py (new file)

@@ -0,0 +1,75 @@
import json
import os


def save_merge_json(data_lis, file_path):
    # Dump the merged list of records into a single JSON file.
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))


def get_all_file_paths(folder_path, file_type='.jsonl'):
    # Make sure the argument is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")
    # Collect the paths of all matching files directly under the folder.
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path)
                  if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
    return file_paths


if __name__ == '__main__':
    data_ai = 'qwen'      # python merge_jsonl_r.py > qwen.txt
    # data_ai = 'zhipuai' # python merge_jsonl_r.py > zhipuai.txt
    root_dir = rf'./{data_ai}/'
    save_final_merge_json_path = f'{data_ai}_final_merge.json'

    subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir)
                  if os.path.isdir(os.path.join(root_dir, d))]

    final_list = []
    for folder_path in subfolders:
        conversion_lis = []
        merge_path = folder_path.split('/')[-1]
        try:
            merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2] != '.' else ''
        except IndexError:
            merge_last_path = ''
        print(f'merge_path={merge_path}, merge_last_path={merge_last_path}')

        for path in get_all_file_paths(folder_path):
            print(path)
            with open(path, 'rt', encoding='utf-8') as file:
                for line in file:
                    # Strip the trailing newline, then parse the line as JSON.
                    line = line.rstrip('\n')
                    try:
                        data = json.loads(line)
                        conversion_lis.append(data)
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON: {e}")

        # Save the per-subfolder merge, then accumulate into the final list.
        if merge_last_path != '':
            save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
        elif merge_path != '':
            save_merge_json_path = rf'./{merge_path}_merge.json'
        else:
            save_merge_json_path = r'./curr_merge.json'
        save_merge_json(data_lis=conversion_lis, file_path=save_merge_json_path)
        final_list = final_list + conversion_lis
        print(len(conversion_lis), len(final_list), save_merge_json_path)

    save_merge_json(data_lis=final_list, file_path=save_final_merge_json_path)
    print(save_final_merge_json_path)
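
For orientation, the directory layout this variant expects, with illustrative subfolder and file names; it writes one `*_merge.json` per subfolder and a final `qwen_final_merge.json` in the working directory:

```
./qwen/                    <- root_dir for data_ai = 'qwen'
├── batch_a/
│   ├── part1.jsonl        -> merged into ./qwen/batch_a_merge.json
│   └── part2.jsonl
└── batch_b/
    └── part1.jsonl        -> merged into ./qwen/batch_b_merge.json
```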


@@ -100,7 +100,10 @@
 5. **Dataset integration**
-   Before integrating the dataset, we need to check whether the generated data has formatting errors or mismatched types. We use check.py to check the data, and finally use merge_json.py to merge all the JSON into one overall JSON file.
+   Before integrating the dataset, we need to check whether the generated data has formatting errors, mismatched types, and similar problems.
+   * First run `check.py` to validate the data (see the sketch after this list).
+   * Then run `merge_json.py` to merge all the JSON files into a single overall JSON file.
 6. **Evaluation and optimization**
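
check.py itself is not shown in this commit. As a rough illustration only, a validator in its spirit might look like the sketch below; the `conversation`/`input`/`output` field names are an assumption about the generated record format, not the script's actual schema:

```python
import json

def check_line(line: str) -> bool:
    """Return True when a generated record parses and has the expected shape."""
    try:
        record = json.loads(line)
    except json.JSONDecodeError:
        return False
    # Assumed schema: {"conversation": [{"input": str, "output": str}, ...]}
    turns = record.get("conversation")
    if not isinstance(turns, list) or not turns:
        return False
    return all(isinstance(t, dict)
               and isinstance(t.get("input"), str)
               and isinstance(t.get("output"), str)
               for t in turns)
```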


@@ -34,11 +34,21 @@ def zhipu_api(data, emo):
     top_p = round(random.uniform(0.1, 0.9), 2)
     messages = getText('user', prompt)

-    response = client.chat.completions.create(
-        model='glm-4',
-        messages=messages,
-        top_p=top_p,
-    )
+    # Error code: 400, with error text {"error":{"code":"1301","message":
+    # "The system detected that the input or generated content may contain unsafe or sensitive content. Please avoid prompts that tend to produce sensitive content. Thank you for your cooperation."}}
+    try:
+        response = client.chat.completions.create(
+            model='glm-4',
+            messages=messages,
+            top_p=top_p,
+        )
+    except Exception:
+        # Retry once on failure (e.g. the content-safety 400 above).
+        response = client.chat.completions.create(
+            model='glm-4',
+            messages=messages,
+            top_p=top_p,
+        )
     return response.choices[0].message.content
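
Retrying exactly once still raises if the second call also fails, and the content-safety 400 quoted above will fail deterministically for the same prompt. A more defensive pattern is a bounded retry with backoff that lets the caller skip the sample; this is a sketch, not what the script currently does:

```python
import time

def create_with_retry(client, messages, top_p, retries=3):
    """Call the GLM-4 chat endpoint with bounded retries; None if all attempts fail."""
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model='glm-4',
                messages=messages,
                top_p=top_p,
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"glm-4 call failed (attempt {attempt + 1}/{retries}): {e}")
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s between attempts
    return None  # caller decides whether to skip this sample
```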


@@ -1,11 +0,0 @@
-# Cleaning QA pairs
-Call qwen to judge whether the current QA pair belongs to the field of psychology, and remove the QA pairs that do not.
-## Step 1
-1. Prepare the QA pair data that needs cleaning.
-2. Put the data into the data folder at the same level as model.
-3. Modify judge_dir in config/config.py according to the folder name. I did not rename the file, so my judge_dir is judge_dir = os.path.join(data_dir, '数据整合')
-## Step 2
-1. Run QA_clean.py.
-2. The cleaned QA pairs are stored in jsonl format under data/cleaned.


@@ -93,3 +93,34 @@
 ## **Step 4: Clean the QA pairs**
 - Purpose of cleaning
+  - Improve the quality of the extracted QA data and clean out QA pairs unrelated to psychology
+- Cleaning method
+  - Use a prompt to drive the LLM to judge each given QA pair
+  - **Reference prompt**
+  - ```markdown
+    You are an experienced counselor, familiar with psychology. Based on the QA pair I provide, judge whether this QA pair belongs to the field of psychology.
+    The criteria are as follows:
+    - If the current QA pair belongs to the field of psychology, return 1
+    - If the current QA pair does not belong to the field of psychology, return 0
+    The following is the given psychology QA pair:
+    ```
+- Cleaning tools
+  - Configure `DASHSCOPE_API_KEY` in `config/config.py`; see Step 3 for how to obtain the `API_KEY`
+  - Use the provided cleaning script [QA_Clear](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/qa_generation/QA_clean.py)
+- How to use
+  - Prepare the QA pair data to be cleaned
+  - Put the data into the data folder at the same level as model
+  - Modify `judge_dir` in `config/config.py` according to the folder name
+    - If the folder holding the data is named `xxx`, then `judge_dir` is `judge_dir = os.path.join(data_dir, 'xxx')`
+  - The cleaned QA pairs are stored in `jsonl` format under `data/cleaned`


@@ -93,3 +93,40 @@ Using books specialized in psychology to build QA knowledge pairs for RAG to pro
 ## **Step 4: Cleaning of QA pairs**
 - Purpose of cleaning
+  - Improve the quality of the extracted QA data and clean out QA pairs that are not relevant to psychology
+- Cleaning methods
+  - Use the prompt method to drive the LLM to judge each given QA pair (see the sketch after this list)
+  - **Reference prompt**
+  - ```markdown
+    You are an experienced counselor and are familiar with psychology. Based on the QA pair I provide, determine whether this QA pair is psychological in nature.
+    The criteria are as follows:
+    - If the current QA pair belongs to the category of psychology, return 1
+    - If the current QA pair does not belong to the category of psychology, return 0
+    The following is the content of the given psychology QA pair:
+    ```
+- Cleaning tools
+  - Configure `DASHSCOPE_API_KEY` in `config/config.py`; see Step 3 for how to get the `API_KEY`
+  - Use the provided cleaning script [QA_Clear](https://github.com/SmartFlowAI/EmoLLM/blob/main/scripts/qa_generation/QA_clean.py)
+- How to use
+  - Prepare the QA pair data to be cleaned
+  - Put the data into the data folder at the same level as the model
+  - Modify `judge_dir` in `config/config.py` according to the folder name
+    - If the folder holding the data is named `xxx`, then `judge_dir` is `judge_dir = os.path.join(data_dir, 'xxx')`
+  - The cleaned QA pairs are stored as `jsonl` under `data/cleaned`
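
To make the judging step concrete, here is a minimal sketch of how the reference prompt above can drive a binary keep/drop decision. It is not the actual QA_clean.py; `call_llm` is a hypothetical stand-in for whichever API client the script configures:

```python
JUDGE_PROMPT = (
    "You are an experienced counselor and are familiar with psychology. "
    "Based on the QA pair I provide, determine whether this QA pair is "
    "psychological in nature.\n"
    "The criteria are as follows:\n"
    "- If the current QA pair belongs to the category of psychology, return 1\n"
    "- If the current QA pair does not belong to the category of psychology, return 0\n"
    "The following is the content of the given psychology QA pair:\n"
)

def is_psychology_qa(qa_pair: dict, call_llm) -> bool:
    """Ask the LLM judge; keep the pair only when it answers 1."""
    reply = call_llm(JUDGE_PROMPT + str(qa_pair))
    # The judge is asked for a bare 0 or 1; tolerate surrounding whitespace/text.
    return reply.strip().startswith("1")

# Usage: cleaned = [qa for qa in qa_pairs if is_psychology_qa(qa, call_llm)]
```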