"""Convert an XTuner-style conversation dataset into ShareGPT-style JSON.

Expected input: a JSON array of conversation groups, where only the first
turn of a group may carry a ``"system"`` prompt::

    [
        {
            "conversation": [
                {"system": "system", "input": "input", "output": "output"},
                {"input": "input", "output": "output"},
                {"input": "input", "output": "output"}
            ]
        }
    ]

Produced output: a JSON array with one object per conversation group::

    [
        {
            "conversations": [
                {"from": "human", "value": "input"},
                {"from": "gpt", "value": "output"}
            ],
            "system": "system"
        }
    ]
"""
import argparse
import json

# Defaults used when the script is run without command-line arguments.
DEFAULT_INPUT_PATH = "../datasets/scientist.json"
DEFAULT_OUTPUT_PATH = "../datasets/scientist_sharegpt.json"


def _convert_group(conversation_group):
    """Return the ShareGPT-style dict for one XTuner conversation group.

    Raises:
        KeyError: if the group has no ``"conversation"`` key, or a turn lacks
            ``"input"`` or ``"output"``.
    """
    turns = conversation_group["conversation"]

    # The system prompt is read from the first turn only. It is optional, and
    # the conversation itself may be empty, so fall back to an empty string
    # instead of raising KeyError / IndexError.
    system = turns[0].get("system", "") if turns else ""

    # Each input/output pair becomes two consecutive messages.
    messages = []
    for turn in turns:
        messages.append({"from": "human", "value": turn["input"]})
        messages.append({"from": "gpt", "value": turn["output"]})

    return {"conversations": messages, "system": system}


def convert_xtuner_to_sharegpt(input_path, output_path):
    """Read an XTuner-style JSON file and write it back in ShareGPT style.

    Args:
        input_path: Path of the UTF-8 JSON file to read. It must contain a
            list of objects, each with a ``"conversation"`` list of turns.
        output_path: Path of the UTF-8 JSON file to write. An existing file
            is overwritten.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if the input file is not valid JSON.
        KeyError: if a group lacks ``"conversation"`` or a turn lacks
            ``"input"`` / ``"output"``.
    """
    with open(input_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    transformed_conversations = [_convert_group(group) for group in data]

    # ensure_ascii=False keeps non-ASCII text readable in the output file
    # rather than emitting \uXXXX escapes.
    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(
            transformed_conversations,
            output_file,
            ensure_ascii=False,
            indent=4,
        )


def main(argv=None):
    """Parse optional input/output paths and run the conversion.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Convert an XTuner-style JSON dataset to ShareGPT style.",
    )
    parser.add_argument(
        "input_path",
        nargs="?",
        default=DEFAULT_INPUT_PATH,
        help="XTuner-style JSON file to read (default: %(default)s)",
    )
    parser.add_argument(
        "output_path",
        nargs="?",
        default=DEFAULT_OUTPUT_PATH,
        help="ShareGPT-style JSON file to write (default: %(default)s)",
    )
    args = parser.parse_args(argv)
    convert_xtuner_to_sharegpt(args.input_path, args.output_path)


if __name__ == "__main__":
    main()