import os
import json

# Directory that holds the raw QA ``.jsonl`` files produced upstream.
directory_path = '../初步清洗的QA数据'


def convert_to_desired_format(input_file, output_file):
    """Convert one JSON-Lines QA file into a single JSON array file.

    Each non-blank input line must be a JSON object with ``question`` and
    ``answer`` keys.  The output file is a JSON list of
    ``{"prompt": ..., "completion": ...}`` objects, written as UTF-8 with
    human-readable indentation.

    :param input_file:  path of the ``.jsonl`` source file
    :param output_file: path of the ``.json`` file to create/overwrite
    :raises KeyError:   if a record lacks ``question`` or ``answer``
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        # Skip blank lines so a trailing newline does not crash json.loads.
        records = [json.loads(line) for line in f if line.strip()]

    transformed_data = [
        {"prompt": entry["question"], "completion": entry["answer"]}
        for entry in records
    ]

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(transformed_data, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    # Bug fix: the original wrapped os.walk() in an extra
    # ``for filename in os.listdir(directory_path)`` loop, which re-walked
    # the tree and re-converted every file once per top-level directory
    # entry (and shadowed ``filename``).  A single walk is sufficient.
    for root, dirs, files in os.walk(directory_path):
        for filename in files:
            if filename.endswith('.jsonl'):
                file_path = os.path.join(root, filename)
                output_file = file_path.replace('.jsonl', '.json')
                convert_to_desired_format(file_path, output_file)
import os
import json

# System prompt injected into every fine-tuning conversation.
# Runtime string — kept byte-for-byte from the original script.
system = "你由EmoLLM团队打造的心理健康助手,是一个研究过无数具有心理健康问题的病人与心理健康医生对话的心理专家, 在心理方面拥有广博的知识储备和丰富的研究咨询经验。请充分利用专业心理学知识,对用户提出的问题进行回答。"

# Directory that holds the ``.json`` files produced by Step 1.
directory_path = '../初步清洗的QA数据'


def build_conversations(items, system_prompt=system):
    """Wrap prompt/completion records into single-turn conversation dicts.

    :param items: iterable of ``{"prompt": ..., "completion": ...}`` dicts
    :param system_prompt: system message to attach to every conversation
    :returns: list of ``{"conversation": [...]}`` records for fine-tuning
    """
    return [
        {
            "conversation": [
                {
                    "system": system_prompt,
                    "input": item["prompt"],
                    "output": item["completion"],
                }
            ]
        }
        for item in items
    ]


def collect_book_qa(directory, system_prompt=system):
    """Read every ``.json`` file under *directory* (recursively) and build
    the instruction-tuning records.

    Bug fix: the original wrapped ``os.walk`` in an extra
    ``for filename in os.listdir(directory)`` loop, so every file's records
    were appended once per top-level directory entry — duplicating the
    training data N-fold.  One walk is enough.
    """
    records = []
    for root, _dirs, files in os.walk(directory):
        for name in files:
            if name.endswith('.json'):
                with open(os.path.join(root, name), 'rt', encoding='utf-8') as fh:
                    records.extend(build_conversations(json.load(fh), system_prompt))
    return records


def convert_xtuner_to_sharegpt(input_path, output_path):
    """Convert an XTuner-format dataset file into ShareGPT format.

    Input: a JSON list of ``{"conversation": [{"system"?, "input",
    "output"}, ...]}`` groups.  Output: a JSON list of
    ``{"conversations": [{"from": "human"/"gpt", "value": ...}, ...],
    "system": ...}`` groups, written UTF-8 with indentation.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    transformed_conversations = []
    for conversation_group in data:
        turns = conversation_group["conversation"]
        # Bug fix: tolerate groups whose first turn carries no "system"
        # key (the original raised KeyError on them).
        system_prompt = turns[0].get("system", "")

        # Every turn — including the one carrying "system" — also has an
        # input/output pair, so no turn is skipped.
        pairs = []
        for turn in turns:
            pairs.append({"from": "human", "value": turn["input"]})
            pairs.append({"from": "gpt", "value": turn["output"]})

        transformed_conversations.append(
            {"conversations": pairs, "system": system_prompt}
        )

    with open(output_path, "w", encoding='utf-8') as output_file:
        json.dump(transformed_conversations, output_file,
                  ensure_ascii=False, indent=4)


if __name__ == "__main__":
    # Step 2 of the Book QA pipeline: gather all converted QA files and
    # emit one combined instruction-tuning dataset.
    # (Plain string literal now — the original used an f-string with no
    # placeholders.)
    with open('processed_Book_QA.json', 'wt', encoding='utf-8') as file:
        json.dump(collect_book_qa(directory_path), file,
                  ensure_ascii=False, indent=4)

    convert_xtuner_to_sharegpt("../datasets/scientist.json",
                               "../datasets/scientist_sharegpt.json")