Merge branch 'SmartFlowAI:main' into main

2024-04-29 11:03:52 +09:00 · 2024-04-29 11:03:52 +09:00 · 65e37c0a4d
commit 65e37c0a4d
parent 309b80b318 4e05afb8c1
4 changed files with 140 additions and 0 deletions
--- a/datasets/processed/Book_QA_Process.md
+++ b/datasets/processed/Book_QA_Process.md
@ -0,0 +1,14 @@
 ## 一共有两个 .py 文件，分别为Book_QA_process_Step_1.py和Book_QA_process_Step_2.py
 ### Book_QA_process_Step_1.py
    该脚本将我们生成的 QA 对 .jsonl 数据（每行一个含 question / answer 字段的 JSON 对象）转换为 .json 格式（字段改名为 prompt / completion），输出文件与输入文件位于同一目录
 ### Book_QA_process_Step_2.py
    该脚本将第一步生成的 .json 数据转换为可用于指令微调的对话格式，为每条数据添加 system 提示词，并将全部结果汇总写入 processed_Book_QA.json，格式如下：
    {
        "conversation": [
            {
                "system": "你由EmoLLM团队打造的心理健康助手......",
                "input": "Question",
                "output": "Answer"
            }
        ]
    }
--- a/datasets/processed/Book_QA_process_Step_1.py
+++ b/datasets/processed/Book_QA_process_Step_1.py
@ -0,0 +1,35 @@
 import os
 import json
 # Directory holding the .jsonl files to convert. The path is relative, so it is
 # resolved against the current working directory when the script runs.
 directory_path = '../初步清洗的QA数据'
 def convert_to_desired_format(input_file, output_file):
    """Convert a JSONL file of QA records into a JSON array of prompt/completion pairs.

    Each non-blank line of ``input_file`` is parsed as a JSON object and must
    contain the keys ``"question"`` and ``"answer"``. They are written to
    ``output_file`` as a list of ``{"prompt": ..., "completion": ...}`` objects,
    in the same order as the input lines.

    Args:
        input_file: Path of the UTF-8 encoded .jsonl file to read.
        output_file: Path of the UTF-8 encoded .json file to write
            (overwritten if it already exists).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "question" or "answer".
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        # Blank or whitespace-only lines are not records; skip them
        # instead of letting json.loads fail on an empty document.
        data = [json.loads(line) for line in f if line.strip()]
    transformed_data = [
        {"prompt": entry["question"], "completion": entry["answer"]}
        for entry in data
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(transformed_data, f, ensure_ascii=False, indent=4)
 # Convert every .jsonl file under directory_path (including subdirectories)
 # into a sibling .json file with the same base name.
 # Fail fast when the input directory is missing: os.walk alone would
 # silently yield nothing and the script would appear to succeed.
 if not os.path.isdir(directory_path):
    raise FileNotFoundError(f"Input directory not found: {directory_path}")
 # os.walk already visits the whole tree, so it is called exactly once;
 # wrapping it in a loop over os.listdir would redo every conversion once
 # per directory entry.
 for root, _dirs, files in os.walk(directory_path):
    for filename in files:
        # Only .jsonl inputs are converted; everything else is left alone.
        if not filename.endswith('.jsonl'):
            continue
        file_path = os.path.join(root, filename)
        # Swap only the trailing extension. str.replace would also rewrite
        # any '.jsonl' that happens to occur earlier in the path.
        output_file = os.path.splitext(file_path)[0] + '.json'
        convert_to_desired_format(file_path, output_file)
--- a/datasets/processed/Book_QA_process_Step_2.py
+++ b/datasets/processed/Book_QA_process_Step_2.py
@ -0,0 +1,32 @@
 import json
 import os
 # Build an instruction-tuning dataset from the prompt/completion .json files
 # found under directory_path and write it to processed_Book_QA.json.
 # The directory path is relative to the current working directory.
 directory_path = '../初步清洗的QA数据'
 # System prompt attached to every generated conversation.
 system = "你由EmoLLM团队打造的心理健康助手，是一个研究过无数具有心理健康问题的病人与心理健康医生对话的心理专家, 在心理方面拥有广博的知识储备和丰富的研究咨询经验。请充分利用专业心理学知识，对用户提出的问题进行回答。"
 format2_data = []
 # Fail fast when the input directory is missing: os.walk alone would
 # silently yield nothing and an empty dataset would be written.
 if not os.path.isdir(directory_path):
    raise FileNotFoundError(f"Input directory not found: {directory_path}")
 # Walk the tree exactly once. Nesting os.walk inside a loop over
 # os.listdir would append every record once per directory entry and
 # duplicate the whole dataset.
 for root, _dirs, files in os.walk(directory_path):
    for filename in files:
        # Only .json files are read; other files are ignored.
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(root, filename)
        with open(file_path, 'rt', encoding='utf-8') as file:
            format1_data = json.load(file)
        # Each item is expected to carry "prompt" and "completion" keys;
        # it becomes a single-turn conversation with the shared system prompt.
        format2_data.extend(
            {
                "conversation": [
                    {
                        "system": system,
                        "input": item["prompt"],
                        "output": item["completion"],
                    }
                ]
            }
            for item in format1_data
        )
 # The output path is relative to the current working directory.
 with open('processed_Book_QA.json', 'wt', encoding='utf-8') as file:
    json.dump(format2_data, file, ensure_ascii=False, indent=4)
--- a/scripts/xtuner2sharegpt.py
+++ b/scripts/xtuner2sharegpt.py
@ -0,0 +1,59 @@
 import json
 # Example of the XTuner-format input that convert_xtuner_to_sharegpt expects.
 # The "system" prompt appears only on the first turn of each conversation.
 # [
 #     {
 #         "conversation": [
 #             {"system": "system", "input": "input", "output": "output"},
 #             {"input": "input", "output": "output"},
 #             {"input": "input", "output": "output"}
 #         ]
 #     },
 #     {
 #         "conversation": [
 #             {"system": "system", "input": "input", "output": "output"},
 #             {"input": "input", "output": "output"},
 #             {"input": "input", "output": "output"}
 #         ]
 #     }
 # ]
 def convert_xtuner_to_sharegpt(input_path, output_path):
    """Convert an XTuner-style conversation file into ShareGPT-style JSON.

    The input is a JSON list of ``{"conversation": [turn, ...]}`` objects,
    where each turn has ``"input"`` and ``"output"`` keys and the first turn
    may also carry a ``"system"`` prompt. Every turn becomes a
    ``{"from": "human", ...}`` / ``{"from": "gpt", ...}`` message pair, and
    each conversation is written as
    ``{"conversations": [...], "system": ...}``.

    Args:
        input_path: Path of the UTF-8 encoded XTuner-format JSON file.
        output_path: Path of the UTF-8 encoded JSON file to write
            (overwritten if it already exists).

    Raises:
        KeyError: If an entry lacks "conversation", or a turn lacks
            "input" or "output".
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    transformed_conversations = []
    for conversation_group in data:
        turns = conversation_group["conversation"]
        # The system prompt is read from the first turn only. Fall back to
        # an empty prompt when it is absent or the conversation has no
        # turns, rather than failing with KeyError / IndexError.
        system = turns[0].get("system", "") if turns else ""
        # The first turn is converted too: it holds a real input/output
        # pair alongside its system prompt.
        transformed_pairs = []
        for pair in turns:
            transformed_pairs.append({"from": "human", "value": pair["input"]})
            transformed_pairs.append({"from": "gpt", "value": pair["output"]})
        transformed_conversations.append({
            "conversations": transformed_pairs,
            "system": system,
        })
    with open(output_path, "w", encoding='utf-8') as output_file:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(transformed_conversations, output_file, ensure_ascii=False, indent=4)
 if __name__ == "__main__":
    # Script entry point: convert the scientist dataset using the default
    # locations, which are relative to the current working directory.
    convert_xtuner_to_sharegpt(
        input_path="../datasets/scientist.json",
        output_path="../datasets/scientist_sharegpt.json",
    )