From c16761e289825c631d7c54d8ba3baaaf188f2a58 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Mon, 18 Mar 2024 23:35:21 +0900
Subject: [PATCH] update three merge_json*.py files and corresponding tutorial
 in CN and EN

update three merge_json*.py files and corresponding tutorial in CN and EN
---
 .gitignore                     |  2 +
 generate_data/merge_json.py    | 40 ++++++++++++++++++
 generate_data/merge_jsonl.py   | 62 +++++++++++++++++++++++++++
 generate_data/merge_jsonl_r.py | 77 ++++++++++++++++++++++++++++++++++
 generate_data/tutorial.md      | 73 ++++++++++++++++++++------------
 generate_data/tutorial_EN.md   | 77 ++++++++++++++++++++++------------
 6 files changed, 278 insertions(+), 53 deletions(-)
 create mode 100644 generate_data/merge_json.py
 create mode 100644 generate_data/merge_jsonl.py
 create mode 100644 generate_data/merge_jsonl_r.py

diff --git a/.gitignore b/.gitignore
index 2d26489..d6ca709 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@ pdf/
 .idea/
 *.jsonl
+*.json
+*.txt
 
 # ./generate_data/*.josnl
 # ./generate_data/*/*/*.josnl
diff --git a/generate_data/merge_json.py b/generate_data/merge_json.py
new file mode 100644
index 0000000..714befb
--- /dev/null
+++ b/generate_data/merge_json.py
@@ -0,0 +1,40 @@
+import json
+import os
+
+
+def save_merge_json(data_lis, file_path):
+    import json
+
+    with open(file_path, 'wt', encoding='utf-8') as file:
+        json.dump(data_lis, file, ensure_ascii=False)
+
+
+def get_all_file_paths(folder_path):
+    # Make sure the argument is a directory
+    if not os.path.isdir(folder_path):
+        raise ValueError(f"{folder_path} is not a valid directory")
+
+    # Collect the paths of all files directly under the folder
+    file_paths = [os.path.join(folder_path, file) for file in os.listdir(
+        folder_path) if os.path.isfile(os.path.join(folder_path, file))]
+    return file_paths
+
+
+if __name__ == '__main__':
+    conversion_lis = []
+
+    for path in get_all_file_paths(r'data\res-aiwei'):
+        print(path)
+
+        with open(path, 'rt', encoding='utf-8') as file:
+            for line in file:
+                # Strip the trailing newline
+                line = line.rstrip('\n')
+                # Parse the JSON on this line
+                try:
+                    data = json.loads(line)
+                    conversion_lis.append(data)
+                except json.JSONDecodeError as e:
+                    print(f"Error decoding JSON: {e}")
+    save_merge_json(data_lis=conversion_lis,
+                    file_path=r'.\merge.json')
diff --git a/generate_data/merge_jsonl.py b/generate_data/merge_jsonl.py
new file mode 100644
index 0000000..7887ab0
--- /dev/null
+++ b/generate_data/merge_jsonl.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+import json
+import os
+
+
+def save_merge_json(data_lis, file_path):
+    with open(file_path, 'wt', encoding='utf-8') as file:
+        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':'))
+
+
+def get_all_file_paths(folder_path, file_type='.jsonl'):
+    # Make sure the argument is a directory
+    if not os.path.isdir(folder_path):
+        raise ValueError(f"{folder_path} is not a valid directory")
+
+    # Collect the paths of all files of the given type directly under the folder
+    file_paths = [os.path.join(folder_path, file) for file in os.listdir(
+        folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
+    return file_paths
+
+
+if __name__ == '__main__':
+    conversion_lis = []
+
+    folder_path = r'./'    # python merge_jsonl.py > curr.txt
+
+    merge_path = folder_path.split('/')[-1]
+    try:
+        merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' else ''
+    except IndexError:
+        merge_last_path = ''
+    print(f'merge_path={merge_path},merge_last_path={merge_last_path}')
+
+
+    for path in get_all_file_paths(folder_path):
+        print(path.encode("utf-8"))
+
+        with open(path, 'rt', encoding='utf-8') as file:
+            for line in file:
+                # # Strip the trailing newline
+                # if line == '\n':
+                #     line = line.rstrip('\n')
+                line = line.rstrip('\n')
+                # Parse the JSON on this line
+                try:
+                    data = json.loads(line)
+                    conversion_lis.append(data)
+                    # conversion_lis.append('\n')
+                except json.JSONDecodeError as e:
+                    print(f"Error decoding JSON: {e}")
+
+    if merge_last_path!='':
+        save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
+    elif merge_path!='':
+        save_merge_json_path = rf'./{merge_path}_merge.json'
+    else:
+        save_merge_json_path = rf'./curr_merge.json'
+
+    save_merge_json(data_lis=conversion_lis,
+                    file_path=save_merge_json_path)
+    print(len(conversion_lis),save_merge_json_path)
diff --git a/generate_data/merge_jsonl_r.py b/generate_data/merge_jsonl_r.py
new file mode 100644
index 0000000..cf4998a
--- /dev/null
+++ b/generate_data/merge_jsonl_r.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+
+import json
+import os
+
+
+def save_merge_json(data_lis, file_path):
+    with open(file_path, 'wt', encoding='utf-8') as file:
+        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':'))
+
+
+def get_all_file_paths(folder_path, file_type='.jsonl'):
+    # 确保传入的是一个目录
+    if not os.path.isdir(folder_path):
+        raise ValueError(f"{folder_path} is not a valid directory")
+
+    # 获取文件夹下所有文件的路径
+    file_paths = [os.path.join(folder_path, file) for file in os.listdir(
+        folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
+    return file_paths
+
+
+if __name__ == '__main__':
+
+    data_ai = 'qwen'  # python merge_jsonl_r.py > qwen.txt
+    # data_ai = 'zhipuai'  # python merge_jsonl_r.py > zhipuai.txt
+    root_dir = rf'./{data_ai}/'
+
+    save_final_merge_json_path = f'{data_ai}_final_merge.json'
+
+    subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]
+
+    final_list = []
+    for folder_path in subfolders:
+        conversion_lis = []
+        merge_path = folder_path.split('/')[-1]
+        try:
+            merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.'
else '' + except: + merge_last_path = '' + print(f'merge_path={merge_path},merge_last_path={merge_last_path}'.encode("utf-8")) + + + for path in get_all_file_paths(folder_path): + print(path.encode("utf-8")) + + with open(path, 'rt', encoding='utf-8') as file: + for line in file: + # # 移除行尾的换行符 + # if line == '\n': + # line = line.rstrip('\n') + line = line.rstrip('\n') + # 解析JSON + try: + data = json.loads(line) + conversion_lis.append(data) + # conversion_lis.append('\n') + except json.JSONDecodeError as e: + print(f"Error decoding JSON: {e}") + + if merge_last_path!='': + save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' + elif merge_path!='': + save_merge_json_path = rf'./{merge_path}_merge.json' + else: + save_merge_json_path = rf'./curr_merge.json' + + save_merge_json(data_lis=conversion_lis, + file_path=save_merge_json_path) + + final_list = final_list+conversion_lis + print(f'{len(conversion_lis)},{len(final_list)},{save_merge_json_path}'.encode("utf-8")) + + save_merge_json(data_lis=final_list,file_path=save_final_merge_json_path) + print(len(conversion_lis),save_final_merge_json_path.encode("utf-8")) + + diff --git a/generate_data/tutorial.md b/generate_data/tutorial.md index 80426b4..f7af989 100644 --- a/generate_data/tutorial.md +++ b/generate_data/tutorial.md @@ -22,7 +22,7 @@ ## **三、实践步骤** -1. **初始化** +### 1. **初始化** * 安装所需的软件和库 @@ -34,49 +34,62 @@ 可参见 `config.yml`均有注释 -2. **模型选择与配置** +### 2. **模型选择与配置** * 根据需求选择适合的模型 为了使大家都能够玩上大模型,我们选用InterLLM2-7B作为我们的基线模型(消费级显卡也可部署微调的哦) * 对模型进行必要的配置和调整 根据我们的数据集以及配置策略,使用XTuner进行微调 -3. **数据生成** +### 3. **数据生成** + +#### **三种改进前的数据生成方法** * 使用通义千问大模型进行数据生成 - ```bash +```bash # 终端运行 bash run_qwen.bash - - # 或者不使用终端运行 - python qwen_gen_data_NoBash.py - ``` +``` * 使用百度文心大模型进行数据生成 - ```bash +```bash # 终端运行 python ernie_gen_data.py - ``` - -* 使用智谱GLM大模型进行数据生成 - - ```bash - # 终端运行 - python zhipuai_gen_data.py - ``` +``` * 使用讯飞星火大模型进行数据生成 - ```bash +```bash # 终端运行 python ./xinghuo/gen_data.py - ``` +``` -1. **自我认知数据集的整合** +#### **改进的两种数据生成方法** + +采用改进的数据生成方法生成多轮对话时,首先需要定义`ai_tool`变量,该变量表示LLM模型的名称(`qwen`或`zhipuai`)。根据`ai_tool`变量的值,创建一个`{ai_tool}`文件夹。 + +然后,遍历所有的`area`值,接着根据不同的`emotion`值生成多轮对话。生成的对话会每隔`save_interval`次迭代写入到`./{ai_tool}/{area}/{emotion}.jsonl`文件中。这个过程会重复执行`total_num_each_emo_area`次。 + +* 使用**改进的**通义千问大模型数据生成方法 + +```bash + # 或者不使用bash,直接运行 + python qwen_gen_data_NoBash.py +``` + +* 使用**改进的**智谱GLM大模型数据生成方法 + +```bash + # 终端运行 + python zhipuai_gen_data.py +``` + +### 4. **自我认知数据集的整合** * 自我认知数据集需要按照格式手动生成,如下格式即可 + ```json [ { @@ -98,19 +111,27 @@ ] ``` -5. **数据集整合** +### 5. **数据集整合** - 在进行数据集整合之前,我们要检查生成的数据是否存在格式错误,类型不符合等情况。 +#### Case 1: 使用`python ernie_gen_data.py`、`bash run_qwen.bash`或者`python ./xinghuo/gen_data.py` -* 首先使用`check.py`进行数据检查。 -* 然后使用`merge_json.py`将所有的json整合为一个总的json文件。 +* 首先使用`check.py`进行数据检查。在进行数据集整合之前,我们要检查生成的数据是否存在格式错误,类型不符合等情况。 +* 然后使用`merge_json.py`将所有的json(或者使用`merge_jsonl.py`将所有的jsonl)文件整合为一个总的json文件。 -6. **评估与优化** +#### Case 2: 使用`python qwen_gen_data_NoBash.py`或者`python zhipuai_gen_data.py` + +在这种情况下,我们需要在使用两种改进的生成方法生成多轮对话后,将`{data_ai}`文件夹下所有`{area}`子文件夹中的所有`{emotion}.jsonl`文件合并为`{data_ai}_final_merge.json`文件。 + +* 由于采用了改进的数据生成方法和不同的存储生成对话结构,因此我们可以免除对数据集的检查。 +* 然后使用`merge_jsonl_r.py`将`qwen`或者`zhipuai`定义为`data_ai`变量,并将其文件夹下所有领域(`area`)下所有的jsonl文件整合为一个总的json文件并取名为`{area}_merge.json`,最终在`{data_ai}`文件夹下生成`{data_ai}_final_merge.json`。 +* 然后我们可以手动合成`qwen_final_merge.json`和`zhipuai_final_merge.json`为`qwen_zhipuai_final_merge.json`文件了, 注意合并后的json文件夹中,最外面只有一对`[]`,中间是`{}`包裹的多轮对话。 + +### 6. 
**评估与优化** * 使用适当的评估指标对生成的数据集进行评估 * 根据评估结果进行必要的优化和调整 -7. **测试与部署** +### 7. **测试与部署** * 使用独立测试集对训练好的模型进行评估 * 根据测试结果进行必要的调整和优化 diff --git a/generate_data/tutorial_EN.md b/generate_data/tutorial_EN.md index 25e10e2..85acf33 100644 --- a/generate_data/tutorial_EN.md +++ b/generate_data/tutorial_EN.md @@ -22,7 +22,7 @@ In order to have a better representation of our large mental models, we must hav ## **III. Practical steps** -1. **Initialize** +### 1. **Initialize** * Install the required software and libraries @@ -34,7 +34,7 @@ In order to have a better representation of our large mental models, we must hav See `config.yml` for annotations -2. **Model selection and configuration** +### 2. **Model selection and configuration** * Select the right model for your needs In order to enable everyone to play with the large model, we chose the InterLLM2-7B as our baseline model (consumer graphics cards can also be deployed fine-tuned oh). @@ -42,40 +42,52 @@ In order to have a better representation of our large mental models, we must hav * Make necessary configurations and adjustments to the model Use XTuner for fine-tuning based on our dataset and configuration strategy. -3. **Data generation** +### 3. **Data generation** -* Data generation using Tongyi Qianwen +#### **Three original methods for data generation** + +* 1.Data generation using Tongyi Qianwen - ```bash +```bash # Terminal operation bash run_qwen.bash +``` - # Or just use python without bash - python qwen_gen_data_NoBash.py - ``` - -* Data generation using Wenxin Yiyan +* 2.Data generation using Wenxin Yiyan - ```bash +```bash # Terminal operation python ernie_gen_data.py - ``` +``` -* Data generation using Zhipu GLM +* 3.Data generation using IFlystar Fire - ```bash - # Terminal operation - python zhipuai_gen_data.py - ``` - -* Data generation using IFlystar Fire - - ```bash +```bash # Terminal operation python ./xinghuo/gen_data.py - ``` +``` -4. **Integration of self-cognition datasets** +#### **Two improved methods for data generation** + +When generating multi-turn dialogues with these two improved methods, the first step is to define the value of the `ai_tool` variable, which represents the LLM model name (`qwen` or `zhipuai`). Based on the value of this `ai_tool` variable, a `{ai_tool}` folder is created. + +Then, all `area` values are traversed, followed by different `emotion` values for generating multi-turn dialogues. The generated dialogues are written to the `./{ai_tool}/{area}/{emotion}.jsonl` file every `save_interval` iterations. This process is repeated `total_num_each_emo_area` times. + +* 1.Using the **improved** method for generating data with the Qwen model: + +```bash + # Alternatively, you can run it directly without using bash + python qwen_gen_data_NoBash.py +``` + +* 2.Using the **improved** method for generating data with the Zhipuai GLM-4 model: + +```bash + # Alternatively, you can run it directly without using bash + python zhipuai_gen_data.py +``` + +### 4. **Integration of self-cognition datasets** * Self-cognition dataset this needs to be manually generated in accordance with the format, the following format can be @@ -100,16 +112,27 @@ In order to have a better representation of our large mental models, we must hav ] ``` -5. **dataset integration** +### 5. **Dataset Integration** -Before dataset integration, we need to check whether the generated data has formatting errors, type mismatches, etc. We need check.py to check the data. 
Finally, merge_json.py is used to combine all the json into one overall json file.
 
+#### **Case 1**: Using `python ernie_gen_data.py`, `bash run_qwen.bash`, or `python ./xinghuo/gen_data.py`
 
-6. **Evaluation and optimization**
 
+* First, use `check.py` to check the data. Before integrating the dataset, we need to check whether the generated data has format errors or type mismatches.
+* Then, use `merge_json.py` to consolidate all json files (or use `merge_jsonl.py` to consolidate all jsonl files) into one overall json file.
+
+#### **Case 2**: Using `python qwen_gen_data_NoBash.py` or `python zhipuai_gen_data.py`
+
+In this case, after generating multi-turn dialogues with the two improved methods, we need to merge all `{emotion}.jsonl` files in all `{area}` subfolders under the `{data_ai}` folder into `{data_ai}_final_merge.json`.
+
+* Because the improved generation methods store the generated dialogues in a different structure, the dataset check can be skipped.
+* Then, use `merge_jsonl_r.py` with `data_ai` set to `qwen` or `zhipuai`: it consolidates the jsonl files under each area (`area`) folder into a json file named `{area}_merge.json`, and finally produces `{data_ai}_final_merge.json` in the `{data_ai}` folder.
+* We can then manually merge `qwen_final_merge.json` and `zhipuai_final_merge.json` into `qwen_zhipuai_final_merge.json`. Note that in the merged json file, there is only one pair of `[]` on the outside, and the multi-turn dialogues are wrapped in `{}`.
+
+### 6. **Evaluation and optimization**
 
 * Evaluate the generated dataset using appropriate evaluation metrics
 * Make necessary optimizations and adjustments based on the evaluation results
 
-7. **Testing and deployment**
+### 7. **Testing and deployment**
 
 * Evaluate the trained model using an independent test set
 * Make necessary adjustments and optimizations based on test results
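+
+As a supplement to Case 2 in step 5, the manual merge of `qwen_final_merge.json` and `zhipuai_final_merge.json` can also be scripted. The snippet below is only a minimal sketch of that one step; it assumes both files sit in the current working directory and simply concatenates the two lists into a single outer `[]`:
+
+```python
+import json
+
+# The two per-model files produced by merge_jsonl_r.py (names taken from the tutorial).
+files = ['qwen_final_merge.json', 'zhipuai_final_merge.json']
+
+merged = []
+for path in files:
+    with open(path, 'rt', encoding='utf-8') as f:
+        # Each file already holds one list of multi-turn conversations.
+        merged.extend(json.load(f))
+
+# One outer [] wrapping all the {}-style conversations, as required above.
+with open('qwen_zhipuai_final_merge.json', 'wt', encoding='utf-8') as f:
+    json.dump(merged, f, ensure_ascii=False)
+
+print(len(merged), 'conversations merged')
+```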