Add dataset processing scripts
1. Update process.py to handle multi_turn_dataset_1.json, multi_turn_dataset_2.json, data.json, and data_pro.json. 2. Add datasets/processed/process_single_turn_conversation_construction.py for the single-turn datasets (1 and 2). 3. Add datasets/processed/process_merge.py to merge these 6 updated datasets in datasets/processed/.
This commit is contained in:
parent
ce2cb5156c
commit
085a01eafa
@ -1,12 +1,25 @@
|
|||||||
"""Add the EmoLLM system prompt to the first turn of every conversation.

Reads one dataset JSON file, sets ``conversation[0]['system']`` on every
entry, and writes the result to ``processed_<file_name>`` in the current
working directory.
"""
import json

# Dataset to process; switch the active assignment to handle another file.
# file_name = 'multi_turn_dataset_1.json'
# file_name = 'multi_turn_dataset_2.json'
# file_name = 'data_pro.json'
file_name = 'data.json'

# Directory that holds the raw dataset files.
DATASET_DIR = '/root/StableCascade/emollm2/EmoLLM/datasets'

# System prompt written into the first turn of each conversation.
SYSTEM_PROMPT = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。"


def add_system_prompt(data, system_prompt=SYSTEM_PROMPT):
    """Set ``conversation[0]['system']`` on every entry of *data* in place.

    Entries that have no usable first turn (missing ``conversation`` key,
    empty conversation list, or a non-dict entry) are left untouched and
    reported on stdout as ``<index> <entry>``.

    Args:
        data: Iterable of dataset entries, each expected to look like
            ``{"conversation": [{...}, ...]}``.
        system_prompt: Text stored under the ``system`` key.

    Returns:
        The list of indices of the entries that were skipped.
    """
    skipped = []
    for index, entry in enumerate(data):
        try:
            entry['conversation'][0]['system'] = system_prompt
        except (KeyError, IndexError, TypeError):
            # Original author's note: data.json contains 4 empty entries
            # (indices 425, 483, 742, 1120). Only the errors a malformed
            # entry can raise are caught, so real failures still surface.
            print(index, entry)
            skipped.append(index)
    return skipped


def main():
    """Load the selected dataset, add the system prompt, and save the result."""
    with open(f'{DATASET_DIR}/{file_name}', 'rt', encoding='utf-8') as file:
        data = json.load(file)

    add_system_prompt(data)

    with open(f'processed_{file_name}', 'wt', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    # Show the first entry as a quick sanity check; skip when there is none.
    if data:
        print(data[0])


if __name__ == '__main__':
    main()
|
34
datasets/processed/process_merge.py
Normal file
34
datasets/processed/process_merge.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
"""Merge every JSON file in a directory into one combined JSON list."""
import os
import json

# Directory scanned for ``*.json`` files (the current working directory).
directory_path = './'

# File the merged list is written to.
OUTPUT_PATH = 'combined_data.json'


def merge_json_files(directory, skip_path=None):
    """Load all ``*.json`` files in *directory* and concatenate their content.

    Files are read in sorted name order so the result is deterministic
    (``os.listdir`` returns names in arbitrary order). A file whose top-level
    value is a list contributes its elements; any other value is appended as
    a single element.

    Args:
        directory: Directory to scan (not recursive).
        skip_path: Optional path of a file to leave out, typically the merge
            output itself so a re-run does not merge its own previous result.

    Returns:
        A list with the merged content of all files read.
    """
    skip = os.path.abspath(skip_path) if skip_path is not None else None
    combined = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory, filename)
        # Ignore directories that happen to be named "*.json".
        if not os.path.isfile(file_path):
            continue
        if skip is not None and os.path.abspath(file_path) == skip:
            continue
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        if isinstance(data, list):
            combined.extend(data)
        else:
            combined.append(data)
    return combined


def main():
    """Merge the JSON files in ``directory_path`` and save them to ``OUTPUT_PATH``."""
    combined_list = merge_json_files(directory_path, skip_path=OUTPUT_PATH)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as combined_json_file:
        json.dump(combined_list, combined_json_file, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    main()
|
@ -0,0 +1,31 @@
|
|||||||
|
"""Convert a single-turn prompt/completion dataset to the conversation format.

Each input record ``{"prompt": ..., "completion": ...}`` becomes
``{"conversation": [{"system": ..., "input": ..., "output": ...}]}`` and the
converted list is written to ``./processed_<file_name>``.
"""
import json

# Dataset to process; switch the active assignment to handle another file.
# file_name = 'single_turn_dataset_1.json'
file_name = 'single_turn_dataset_2.json'

# Directory that holds the raw dataset files.
DATASET_DIR = '/root/StableCascade/emollm2/EmoLLM/datasets'

# System prompt stored in every generated conversation.
system = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。"


def convert_to_conversations(records, system_prompt=system):
    """Wrap prompt/completion records into one-turn conversations.

    Args:
        records: Iterable of dicts that each have ``prompt`` and
            ``completion`` keys.
        system_prompt: Text stored under the ``system`` key of every turn.

    Returns:
        A new list of ``{"conversation": [turn]}`` dicts, one per record, in
        input order.

    Raises:
        KeyError: If a record lacks ``prompt`` or ``completion``.
    """
    return [
        {
            "conversation": [
                {
                    "system": system_prompt,
                    "input": record["prompt"],
                    "output": record["completion"],
                }
            ]
        }
        for record in records
    ]


def main():
    """Load the selected dataset, convert it, and save the result."""
    with open(f'{DATASET_DIR}/{file_name}', 'rt', encoding='utf-8') as file:
        format1_data = json.load(file)

    format2_data = convert_to_conversations(format1_data)

    with open(f'./processed_{file_name}', 'wt', encoding='utf-8') as file:
        json.dump(format2_data, file, ensure_ascii=False, indent=4)

    # Show the first entry as a quick sanity check; skip when there is none.
    if format2_data:
        print(format2_data[0])


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user