Dev (#224)
This commit is contained in:
		
						commit
						4e05afb8c1
					
				
							
								
								
									
										59
									
								
								scripts/xtuner2sharegpt.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								scripts/xtuner2sharegpt.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,59 @@ | ||||
| import json | ||||
| 
 | ||||
| # Example of the XTuner-format JSON input this script expects: | ||||
| # original_json_data = """ | ||||
| # [ | ||||
| #     { | ||||
| #         "conversation": [ | ||||
| #             {"system": "system", "input": "input", "output": "output"}, | ||||
| #             {"input": "input", "output": "output"}, | ||||
| #             {"input": "input", "output": "output"} | ||||
| #         ] | ||||
| #     }, | ||||
| #     { | ||||
| #         "conversation": [ | ||||
| #             {"system": "system", "input": "input", "output": "output"}, | ||||
| #             {"input": "input", "output": "output"}, | ||||
| #             {"input": "input", "output": "output"} | ||||
| #         ] | ||||
| #     } | ||||
| # ] | ||||
| # """ | ||||
| 
 | ||||
| # Convert an XTuner-format JSON dataset file into ShareGPT format | ||||
def convert_xtuner_to_sharegpt(input_path, output_path):
    """Convert an XTuner-format conversation dataset to ShareGPT format.

    The input file must contain a JSON list of objects shaped like
    ``{"conversation": [{"system": ..., "input": ..., "output": ...}, ...]}``.
    Only the first turn's ``"system"`` value is used; it is optional and
    defaults to an empty string when absent.

    The output file receives a JSON list of objects shaped like
    ``{"conversations": [{"from": "human", "value": ...},
    {"from": "gpt", "value": ...}, ...], "system": ...}``, one per input
    conversation and in the same order.

    Args:
        input_path: Path of the XTuner-format JSON file to read (UTF-8).
        output_path: Path of the ShareGPT-format JSON file to write (UTF-8).

    Raises:
        KeyError: If a group lacks ``"conversation"`` or a turn lacks
            ``"input"`` or ``"output"``.
    """
    with open(input_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    transformed_conversations = []

    for conversation_group in data:
        turns = conversation_group["conversation"]

        # The system prompt lives on the first turn only and may be missing;
        # an empty conversation has no first turn at all.
        system = turns[0].get("system", "") if turns else ""

        # Each input/output pair becomes a human message followed by a gpt one.
        transformed_pairs = []
        for pair in turns:
            transformed_pairs.append({"from": "human", "value": pair["input"]})
            transformed_pairs.append({"from": "gpt", "value": pair["output"]})

        transformed_conversations.append(
            {
                "conversations": transformed_pairs,
                "system": system,
            }
        )

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(transformed_conversations, output_file, ensure_ascii=False, indent=4)
|          | ||||
| 
 | ||||
if __name__ == "__main__":
    # Imported here because argparse is only needed when run as a script.
    import argparse

    # Both paths are optional; with no arguments the script converts the
    # bundled scientist dataset. Relative paths resolve against the current
    # working directory.
    parser = argparse.ArgumentParser(
        description="Convert an XTuner-format JSON dataset to ShareGPT format."
    )
    parser.add_argument(
        "input_path",
        nargs="?",
        default="../datasets/scientist.json",
        help="XTuner-format JSON file to read (default: %(default)s)",
    )
    parser.add_argument(
        "output_path",
        nargs="?",
        default="../datasets/scientist_sharegpt.json",
        help="ShareGPT-format JSON file to write (default: %(default)s)",
    )
    args = parser.parse_args()
    convert_xtuner_to_sharegpt(args.input_path, args.output_path)
|      | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 xzw
						xzw