处理之前根据心理学数据生成的QA数据

2024-04-25 13:22:48 +08:00 · 2024-04-25 13:22:48 +08:00 · a0f76d9c96
commit a0f76d9c96
parent 9829ce7a76
3 changed files with 81 additions and 0 deletions
--- a/datasets/processed/Book_QA_Process.md
+++ b/datasets/processed/Book_QA_Process.md
@ -0,0 +1,14 @@
 ## 一共有两个 .py 文件，分别为Book_QA_process_Step_1.py和Book_QA_process_Step_2.py
 ### Book_QA_process_Step_1.py
    该代码将我们生成的 QA 对 .jsonl 数据（每行一个包含 question / answer 字段的 JSON 对象）转换为 .json 格式（一个列表，字段重命名为 prompt / completion）
 ### Book_QA_process_Step_2.py
    该代码将第一步生成的 json 格式数据转换为可用于指令微调的数据格式，为每条对话添加 system 提示词，并汇总写入 processed_Book_QA.json，即：
    {
        "conversation": [
            {
                "system": "你由EmoLLM团队打造的心理健康助手......",
                "input": "Question",
                "output": "Answer"
            }
        ]
    }
--- a/datasets/processed/Book_QA_process_Step_1.py
+++ b/datasets/processed/Book_QA_process_Step_1.py
@ -0,0 +1,35 @@
 import os
 import json
 # Root directory searched for the .jsonl files to convert; the relative path
 # is resolved against the current working directory.
 directory_path = '../初步清洗的QA数据'
 def convert_to_desired_format(input_file, output_file):
    """Convert a JSON Lines QA file into a single JSON array file.

    Each non-blank line of ``input_file`` is parsed as a JSON object, and its
    ``"question"`` / ``"answer"`` values are stored under the ``"prompt"`` /
    ``"completion"`` keys of a new record. The records are written to
    ``output_file`` as one JSON list, in input order.

    Args:
        input_file: Path of the UTF-8 encoded ``.jsonl`` file to read.
        output_file: Path of the UTF-8 encoded ``.json`` file to write;
            an existing file is overwritten.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``"question"`` or ``"answer"`` key.
    """
    transformed_data = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines carry no record; passing them to
            # json.loads would abort the whole conversion.
            if not line.strip():
                continue
            entry = json.loads(line)
            transformed_data.append({
                "prompt": entry["question"],
                "completion": entry["answer"],
            })
    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(transformed_data, f, ensure_ascii=False, indent=4)
 # Convert every .jsonl file under directory_path (recursively) to a sibling
 # .json file. The tree is walked exactly once: os.walk already visits every
 # sub-directory, so wrapping it in a loop over os.listdir() only repeated the
 # whole conversion once per top-level entry.
 if not os.path.isdir(directory_path):
    # os.walk silently yields nothing for a missing directory; fail loudly so
    # a wrong path is noticed.
    raise FileNotFoundError(f"Input directory not found: {directory_path}")
 for root, dirs, files in os.walk(directory_path):
    for filename in files:
        # Only JSON Lines inputs are converted.
        if filename.endswith('.jsonl'):
            # Build the full path of the input file.
            file_path = os.path.join(root, filename)
            # Swap only the extension; str.replace would also rewrite any
            # '.jsonl' occurring in a directory name along the path.
            output_file = os.path.splitext(file_path)[0] + '.json'
            convert_to_desired_format(file_path, output_file)
--- a/datasets/processed/Book_QA_process_Step_2.py
+++ b/datasets/processed/Book_QA_process_Step_2.py
@ -0,0 +1,32 @@
 import json
 import os
 # Collect every .json QA file under directory_path (recursively), wrap each
 # prompt/completion pair in a single-turn conversation that carries the
 # system prompt, and write all conversations to processed_Book_QA.json in the
 # current working directory.
 directory_path = '../初步清洗的QA数据'
 system = "你由EmoLLM团队打造的心理健康助手，是一个研究过无数具有心理健康问题的病人与心理健康医生对话的心理专家, 在心理方面拥有广博的知识储备和丰富的研究咨询经验。请充分利用专业心理学知识，对用户提出的问题进行回答。"
 format2_data = []
 if not os.path.isdir(directory_path):
    # os.walk silently yields nothing for a missing directory; fail loudly so
    # a wrong path does not produce an empty dataset.
    raise FileNotFoundError(f"Input directory not found: {directory_path}")
 # Walk the tree exactly once. Wrapping os.walk in a loop over os.listdir()
 # appended every record once per top-level entry, duplicating the dataset.
 for root, dirs, files in os.walk(directory_path):
    # Sorting makes the traversal, and therefore the output order, reproducible.
    dirs.sort()
    for filename in sorted(files):
        # Only the .json files produced by step 1 are read.
        if filename.endswith('.json'):
            # Build the full path of the input file.
            file_path = os.path.join(root, filename)
            with open(file_path, 'rt', encoding='utf-8') as file:
                format1_data = json.load(file)
            for item in format1_data:
                conversation = {
                    "system": system,
                    "input": item["prompt"],
                    "output": item["completion"]
                }
                format2_data.append({"conversation": [conversation]})
 with open('processed_Book_QA.json', 'wt', encoding='utf-8') as file:
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    json.dump(format2_data, file, ensure_ascii=False, indent=4)