处理之前根据心理学数据生成的QA数据
This commit is contained in:
parent
9829ce7a76
commit
a0f76d9c96
14
datasets/processed/Book_QA_Process.md
Executable file
14
datasets/processed/Book_QA_Process.md
Executable file
@ -0,0 +1,14 @@
|
|||||||
|
## 一共有两个 .py 文件,分别为Book_QA_process_Step_1.py和Book_QA_process_Step_2.py
|
||||||
|
### Book_QA_process_Step_1.py
|
||||||
|
该脚本会递归遍历 `../初步清洗的QA数据` 目录,将其中每个 .jsonl 文件(每行一个包含 question、answer 字段的 JSON 对象)转换为同名的 .json 文件(由包含 prompt、completion 字段的对象组成的列表)
|
||||||
|
### Book_QA_process_Step_2.py
|
||||||
|
该脚本将第一步生成的 json 格式数据转换为可用于指令微调的数据格式,为每条数据添加 system 字段,并将全部结果汇总写入 processed_Book_QA.json,格式如下:
|
||||||
|
{
|
||||||
|
"conversation": [
|
||||||
|
{
|
||||||
|
"system": "你由EmoLLM团队打造的心理健康助手......",
|
||||||
|
"input": "Question",
|
||||||
|
"output": "Answer"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
35
datasets/processed/Book_QA_process_Step_1.py
Normal file
35
datasets/processed/Book_QA_process_Step_1.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
import os
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Input directory, resolved relative to the current working directory; it is searched recursively for .jsonl files
|
||||||
|
directory_path = '../初步清洗的QA数据'
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_desired_format(input_file, output_file):
    """Convert a JSON Lines file of QA pairs into a single JSON array file.

    Each non-blank line of ``input_file`` is parsed as a JSON object and its
    ``"question"`` / ``"answer"`` values are re-keyed as ``"prompt"`` /
    ``"completion"``. The resulting list is written to ``output_file`` as
    UTF-8 JSON, indented by four spaces, with non-ASCII characters kept
    unescaped.

    Args:
        input_file: Path of the ``.jsonl`` file to read.
        output_file: Path of the ``.json`` file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"question"`` or ``"answer"``.
    """
    transformed_data = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines are not records; feeding them to json.loads
            # would abort the whole conversion with a JSONDecodeError.
            if not line.strip():
                continue
            entry = json.loads(line)
            transformed_data.append({
                "prompt": entry["question"],
                "completion": entry["answer"],
            })

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(transformed_data, f, ensure_ascii=False, indent=4)
|
||||||
|
|
||||||
|
|
||||||
|
# 遍历指定目录下的所有文件
|
||||||
|
# Fail loudly when the input directory is missing; os.walk on its own would
# silently yield nothing and the script would appear to succeed.
if not os.path.isdir(directory_path):
    raise FileNotFoundError(f"Input directory not found: {directory_path}")

# Walk the directory tree exactly once and convert every .jsonl file found.
# (Wrapping the walk in a loop over os.listdir() would repeat the whole
# conversion once per directory entry.)
for root, _dirs, files in os.walk(directory_path):
    for filename in files:
        # Only JSON Lines inputs are converted.
        if not filename.endswith('.jsonl'):
            continue
        # Full path of the input file.
        file_path = os.path.join(root, filename)
        # Swap only the trailing extension, so a '.jsonl' occurring earlier
        # in the path (e.g. in a folder name) is left untouched.
        output_file = file_path[:-len('.jsonl')] + '.json'
        convert_to_desired_format(file_path, output_file)
|
||||||
|
|
32
datasets/processed/Book_QA_process_Step_2.py
Normal file
32
datasets/processed/Book_QA_process_Step_2.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Directory (searched recursively) holding the .json files produced by step 1.
directory_path = '../初步清洗的QA数据'

# System prompt attached to every conversation record.
system = "你由EmoLLM团队打造的心理健康助手,是一个研究过无数具有心理健康问题的病人与心理健康医生对话的心理专家, 在心理方面拥有广博的知识储备和丰富的研究咨询经验。请充分利用专业心理学知识,对用户提出的问题进行回答。"

# Fail loudly when the input directory is missing; os.walk on its own would
# silently yield nothing and an empty dataset would be written.
if not os.path.isdir(directory_path):
    raise FileNotFoundError(f"Input directory not found: {directory_path}")

format2_data = []
# Walk the directory tree exactly once. Wrapping the walk in a loop over
# os.listdir() would append every record once per directory entry and
# duplicate the whole dataset.
for root, _dirs, files in os.walk(directory_path):
    for filename in files:
        # Only the .json files are read.
        if not filename.endswith('.json'):
            continue
        # Full path of the input file.
        file_path = os.path.join(root, filename)
        with open(file_path, 'rt', encoding='utf-8') as file:
            format1_data = json.load(file)
        # Each prompt/completion pair becomes a single-turn conversation.
        for item in format1_data:
            conversation = {
                "system": system,
                "input": item["prompt"],
                "output": item["completion"],
            }
            format2_data.append({"conversation": [conversation]})

# Write the merged dataset into the current working directory.
with open('processed_Book_QA.json', 'wt', encoding='utf-8') as file:
    json.dump(format2_data, file, ensure_ascii=False, indent=4)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user