update three merge_json*.py files and corresponding tutorial in CN and EN
This commit is contained in: commit c16761e289 (parent 3cadeadf09)
.gitignore (2 changes, vendored)
@@ -7,6 +7,8 @@ pdf/
.idea/

*.jsonl
*.json
*.txt
# ./generate_data/*.jsonl
# ./generate_data/*/*/*.jsonl

generate_data/merge_json.py (40 lines, new file)
@@ -0,0 +1,40 @@
import json
import os


def save_merge_json(data_lis, file_path):
    # Write the collected records as a single JSON array
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False)


def get_all_file_paths(folder_path):
    # Make sure the argument is a directory
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    # Collect the paths of all files directly under the folder
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(
        folder_path) if os.path.isfile(os.path.join(folder_path, file))]
    return file_paths


if __name__ == '__main__':
    conversion_lis = []

    for path in get_all_file_paths(r'data\res-aiwei'):
        print(path)

        with open(path, 'rt', encoding='utf-8') as file:
            for line in file:
                # Strip the trailing newline
                line = line.rstrip('\n')
                # Parse the JSON record
                try:
                    data = json.loads(line)
                    conversion_lis.append(data)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")

    # Write the merged file once, after all inputs have been read
    save_merge_json(data_lis=conversion_lis,
                    file_path=r'.\merge.json')
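
For illustration, a sketch of the input and output shapes this script assumes; the file name and record fields below are hypothetical:

    # data\res-aiwei\sample.jsonl -- one JSON object per line
    {"conversation": [{"input": "...", "output": "..."}]}
    {"conversation": [{"input": "...", "output": "..."}]}
    # .\merge.json -- all records collected into a single JSON array
    [{"conversation": [...]}, {"conversation": [...]}]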
generate_data/merge_jsonl.py (62 lines, new file)
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-

import json
import os


def save_merge_json(data_lis, file_path):
    # ',\n' puts each merged record on its own line inside the output array
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))


def get_all_file_paths(folder_path, file_type='.jsonl'):
    # Make sure the argument is a directory
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    # Collect the paths of all matching files directly under the folder
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(
        folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
    return file_paths


if __name__ == '__main__':
    conversion_lis = []

    folder_path = r'./'  # python merge_jsonl.py > curr.txt

    # Derive the output file name from the folder path
    merge_path = folder_path.split('/')[-1]
    try:
        merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2] != '.' else ''
    except IndexError:
        merge_last_path = ''
    print(f'merge_path={merge_path},merge_last_path={merge_last_path}')

    for path in get_all_file_paths(folder_path):
        print(path.encode("utf-8"))

        with open(path, 'rt', encoding='utf-8') as file:
            for line in file:
                # Strip the trailing newline
                line = line.rstrip('\n')
                # Parse the JSON record
                try:
                    data = json.loads(line)
                    conversion_lis.append(data)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")

    if merge_last_path != '':
        save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
    elif merge_path != '':
        save_merge_json_path = rf'./{merge_path}_merge.json'
    else:
        save_merge_json_path = rf'./curr_merge.json'

    save_merge_json(data_lis=conversion_lis,
                    file_path=save_merge_json_path)
    print(len(conversion_lis), save_merge_json_path)
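
As a quick illustration of the separators=(',\n', ':') choice, which breaks the merged array into one record per line (toy values):

    >>> import json
    >>> json.dumps([{"a": 1}, {"b": 2}], separators=(',\n', ':'))
    '[{"a":1},\n{"b":2}]'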
generate_data/merge_jsonl_r.py (77 lines, new file)
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-

import json
import os


def save_merge_json(data_lis, file_path):
    # ',\n' puts each merged record on its own line inside the output array
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))


def get_all_file_paths(folder_path, file_type='.jsonl'):
    # Make sure the argument is a directory
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    # Collect the paths of all matching files directly under the folder
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(
        folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
    return file_paths


if __name__ == '__main__':

    data_ai = 'qwen'  # python merge_jsonl_r.py > qwen.txt
    # data_ai = 'zhipuai'  # python merge_jsonl_r.py > zhipuai.txt
    root_dir = rf'./{data_ai}/'

    save_final_merge_json_path = f'{data_ai}_final_merge.json'

    subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]

    final_list = []
    for folder_path in subfolders:
        conversion_lis = []
        merge_path = folder_path.split('/')[-1]
        try:
            merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2] != '.' else ''
        except IndexError:
            merge_last_path = ''
        print(f'merge_path={merge_path},merge_last_path={merge_last_path}'.encode("utf-8"))

        for path in get_all_file_paths(folder_path):
            print(path.encode("utf-8"))

            with open(path, 'rt', encoding='utf-8') as file:
                for line in file:
                    # Strip the trailing newline
                    line = line.rstrip('\n')
                    # Parse the JSON record
                    try:
                        data = json.loads(line)
                        conversion_lis.append(data)
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON: {e}")

        # Write one merged file per area subfolder
        if merge_last_path != '':
            save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
        elif merge_path != '':
            save_merge_json_path = rf'./{merge_path}_merge.json'
        else:
            save_merge_json_path = rf'./curr_merge.json'

        save_merge_json(data_lis=conversion_lis,
                        file_path=save_merge_json_path)

        final_list = final_list + conversion_lis
        print(f'{len(conversion_lis)},{len(final_list)},{save_merge_json_path}'.encode("utf-8"))

    # Write the combined file across all subfolders
    save_merge_json(data_lis=final_list, file_path=save_final_merge_json_path)
    print(len(final_list), save_final_merge_json_path.encode("utf-8"))
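
For orientation, the files this script reads and writes when data_ai = 'qwen', sketched with hypothetical area and emotion names:

    ./qwen/family/happy.jsonl     # inputs: one JSON object per line
    ./qwen/family/sad.jsonl
    ./qwen/work/happy.jsonl
    ./qwen/family_merge.json      # per-area merge written in the loop
    ./qwen/work_merge.json
    ./qwen_final_merge.json       # combined output across all areas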
@@ -22,7 +22,7 @@

## **III. Practical Steps**

### 1. **Initialization**

* Install the required software and libraries

@@ -34,49 +34,62 @@

  See `config.yml`; every option is annotated

### 2. **Model selection and configuration**

* Select a model that fits your needs
  So that everyone can play with large models, we chose InternLM2-7B as our baseline model (it can be deployed and fine-tuned on consumer graphics cards, too)
* Make the necessary configuration and adjustments to the model
  Fine-tune with XTuner according to our dataset and configuration strategy

### 3. **Data generation**

#### **Three original data generation methods**

* Data generation with the Tongyi Qianwen model

```bash
  # Run in a terminal
  bash run_qwen.bash

  # Or run without the terminal script
  python qwen_gen_data_NoBash.py
```

* Data generation with the Baidu ERNIE model

```bash
  # Run in a terminal
  python ernie_gen_data.py
```

* Data generation with the iFlytek Spark model

```bash
  # Run in a terminal
  python ./xinghuo/gen_data.py
```

#### **Two improved data generation methods**

When generating multi-turn dialogues with the improved methods, first define the `ai_tool` variable, which names the LLM (`qwen` or `zhipuai`); an `{ai_tool}` folder is created according to its value.

Then all `area` values are traversed, and multi-turn dialogues are generated for each `emotion` value. The generated dialogues are written to `./{ai_tool}/{area}/{emotion}.jsonl` every `save_interval` iterations, and this process is repeated `total_num_each_emo_area` times (see the loop sketch below).
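
A minimal sketch of that generation loop; `areas`, `emotions`, `gen_dialogue`, and the counts below are hypothetical stand-ins for the real script's values:

```python
import json
import os

# Hypothetical placeholders -- the real script defines its own values
areas = ['family', 'work']
emotions = ['happy', 'sad']
total_num_each_emo_area = 10
save_interval = 2
ai_tool = 'qwen'  # or 'zhipuai'


def gen_dialogue(area, emotion):
    # Stand-in for the real LLM call that returns one multi-turn dialogue
    return {'area': area, 'emotion': emotion, 'conversation': []}


for area in areas:
    os.makedirs(os.path.join(ai_tool, area), exist_ok=True)
    for emotion in emotions:
        path = f'./{ai_tool}/{area}/{emotion}.jsonl'
        buffer = []
        for i in range(total_num_each_emo_area):
            buffer.append(gen_dialogue(area, emotion))
            if (i + 1) % save_interval == 0:
                # Flush buffered dialogues, one JSON object per line
                with open(path, 'at', encoding='utf-8') as f:
                    for d in buffer:
                        f.write(json.dumps(d, ensure_ascii=False) + '\n')
                buffer = []
```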

* Data generation with the **improved** Tongyi Qianwen method

```bash
  # Run directly, without bash
  python qwen_gen_data_NoBash.py
```

* Data generation with the **improved** Zhipu GLM method

```bash
  # Run in a terminal
  python zhipuai_gen_data.py
```

### 4. **Integration of the self-cognition dataset**

* The self-cognition dataset needs to be generated manually, in the following format

  ```json
  [
      {
@@ -98,19 +111,27 @@
  ]
  ```

### 5. **Dataset integration**

#### Case 1: Using `python ernie_gen_data.py`, `bash run_qwen.bash`, or `python ./xinghuo/gen_data.py`

* First use `check.py` to check the data. Before integrating the dataset, we need to check the generated data for format errors, type mismatches, and the like.
* Then use `merge_json.py` to consolidate all the json files (or `merge_jsonl.py` to consolidate all the jsonl files) into one overall json file, as sketched below.
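
A plausible invocation from the `generate_data` directory (assuming the generated files sit where the scripts expect them):

```bash
python check.py       # sanity-check the generated records
python merge_json.py  # writes the combined ./merge.json
```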

#### Case 2: Using `python qwen_gen_data_NoBash.py` or `python zhipuai_gen_data.py`

In this case, after generating multi-turn dialogues with the two improved methods, we need to merge all the `{emotion}.jsonl` files in all `{area}` subfolders under the `{data_ai}` folder into `{data_ai}_final_merge.json`.

* Because the improved methods generate and store the dialogues in a different structure, we can skip the dataset check.
* Then use `merge_jsonl_r.py`: define `qwen` or `zhipuai` as the `data_ai` variable, and all the jsonl files under every area (`area`) in that folder are consolidated into one json file per area named `{area}_merge.json`; finally, `{data_ai}_final_merge.json` is generated for the `{data_ai}` folder.
* We can then manually merge `qwen_final_merge.json` and `zhipuai_final_merge.json` into `qwen_zhipuai_final_merge.json`. Note that the merged json file has only one outer pair of `[]`, with each multi-turn dialogue wrapped in `{}`, as in the sketch below.
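
A minimal sketch of that merged shape (the field names are hypothetical):

```json
[
    {"conversation": [{"input": "...", "output": "..."}]},
    {"conversation": [{"input": "...", "output": "..."}]}
]
```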

### 6. **Evaluation and optimization**

* Evaluate the generated dataset with appropriate evaluation metrics
* Optimize and adjust as needed based on the evaluation results

### 7. **Testing and deployment**

* Evaluate the trained model on an independent test set
* Adjust and optimize as needed based on the test results

@@ -22,7 +22,7 @@ In order to have a better representation of our large mental models, we must hav

## **III. Practical steps**

### 1. **Initialize**

* Install the required software and libraries

@@ -34,7 +34,7 @@ In order to have a better representation of our large mental models, we must hav

  See `config.yml` for annotations

### 2. **Model selection and configuration**

* Select the right model for your needs
  To enable everyone to play with large models, we chose InternLM2-7B as our baseline model (it can also be deployed and fine-tuned on consumer graphics cards).

@@ -42,40 +42,52 @@ In order to have a better representation of our large mental models, we must hav

* Make the necessary configurations and adjustments to the model
  Use XTuner for fine-tuning based on our dataset and configuration strategy.

### 3. **Data generation**

#### **Three original methods for data generation**

* 1. Data generation using Tongyi Qianwen

```bash
  # Terminal operation
  bash run_qwen.bash

  # Or just use python without bash
  python qwen_gen_data_NoBash.py
```

* 2. Data generation using Wenxin Yiyan

```bash
  # Terminal operation
  python ernie_gen_data.py
```

* 3. Data generation using iFlytek Spark

```bash
  # Terminal operation
  python ./xinghuo/gen_data.py
```

#### **Two improved methods for data generation**

When generating multi-turn dialogues with these two improved methods, the first step is to define the `ai_tool` variable, which represents the LLM name (`qwen` or `zhipuai`). Based on the value of this variable, an `{ai_tool}` folder is created.

Then, all `area` values are traversed, followed by the different `emotion` values, to generate multi-turn dialogues. The generated dialogues are written to the `./{ai_tool}/{area}/{emotion}.jsonl` file every `save_interval` iterations, and this process is repeated `total_num_each_emo_area` times; the resulting folder layout is sketched below.
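
A sketch of that layout after a run, with hypothetical area and emotion names:

```
./qwen/
├── family/
│   ├── happy.jsonl
│   └── sad.jsonl
└── work/
    └── happy.jsonl
```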

* 1. Using the **improved** method for generating data with the Qwen model:

```bash
  # Alternatively, you can run it directly without using bash
  python qwen_gen_data_NoBash.py
```

* 2. Using the **improved** method for generating data with the Zhipuai GLM-4 model:

```bash
  # Alternatively, you can run it directly without using bash
  python zhipuai_gen_data.py
```

### 4. **Integration of self-cognition datasets**

* The self-cognition dataset needs to be generated manually, in the following format

@@ -100,16 +112,27 @@ In order to have a better representation of our large mental models, we must hav

  ]
  ```

### 5. **Dataset Integration**

#### **Case 1**: Using `python ernie_gen_data.py`, `bash run_qwen.bash`, or `python ./xinghuo/gen_data.py`

* First, use `check.py` to check the data. Before integrating the dataset, we need to check whether the generated data has format errors or type mismatches.
* Then, use `merge_json.py` to consolidate all json files (or use `merge_jsonl.py` to consolidate all jsonl files) into one overall json file.

#### **Case 2**: Using `python qwen_gen_data_NoBash.py` or `python zhipuai_gen_data.py`

In this case, after using the two improved generation methods to generate multi-round conversations, we need to merge all `{emotion}.jsonl` files in all `{area}` subfolders under the `{data_ai}` folder into `{data_ai}_final_merge.json`.

* Because the improved data generation methods store the generated dialogues in a different structure, we can skip checking the dataset.
* Then, use `merge_jsonl_r.py`: define `qwen` or `zhipuai` as the `data_ai` variable, and consolidate all jsonl files in every area (`area`) into one json file per area named `{area}_merge.json`; finally, `{data_ai}_final_merge.json` is generated for the `{data_ai}` folder.
* We can then manually merge `qwen_final_merge.json` and `zhipuai_final_merge.json` into `qwen_zhipuai_final_merge.json`. Note that in the merged json file there is only one pair of `[]` on the outside, and each multi-round dialogue is wrapped in `{}`. A typical invocation of the merge script is sketched after this list.
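
A plausible run, following the comments in `merge_jsonl_r.py` (redirecting stdout to a log file is optional):

```bash
# with data_ai = 'qwen' set in merge_jsonl_r.py
python merge_jsonl_r.py > qwen.txt
# per-area output: ./qwen/{area}_merge.json; combined: qwen_final_merge.json
```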

### 6. **Evaluation and optimization**

* Evaluate the generated dataset using appropriate evaluation metrics
* Make necessary optimizations and adjustments based on the evaluation results

### 7. **Testing and deployment**

* Evaluate the trained model using an independent test set
* Make necessary adjustments and optimizations based on test results