diff --git a/datasets/processed/process.py b/datasets/processed/process.py
--- a/datasets/processed/process.py
+++ b/datasets/processed/process.py
@@ -1,12 +1,30 @@
 import json
 
 # 打开JSON文件并读取其内容
-with open('/root/Emollm/datasets/multi_turn_dataset_2.json', 'rt', encoding='utf-8') as file:
+
+# Name of the dataset file to process; switch by uncommenting one alternative.
+# file_name = 'multi_turn_dataset_1.json'
+# file_name = 'multi_turn_dataset_2.json'
+# file_name = 'data_pro.json'
+file_name = 'data.json'
+
+with open(f'/root/StableCascade/emollm2/EmoLLM/datasets/{file_name}', 'rt', encoding='utf-8') as file:
     data = json.load(file)
 
 n = 0
 for i in data:
-    i['conversation'][0]['system'] = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。"
+    # Overwrite the system prompt of the first turn. Records without a usable
+    # first turn are reported (index, record) and left untouched instead of
+    # aborting the run; author's note: data.json has 4 empty ones (425 483 742 1120).
+    try:
+        i['conversation'][0]['system'] = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。"
+    except (KeyError, IndexError, TypeError):
+        print(n, i)
+    n += 1
 
-with open('output2.json', 'wt', encoding='utf-8') as file:
+with open(f'processed_{file_name}', 'wt', encoding='utf-8') as file:
     json.dump(data, file, ensure_ascii=False, indent=4)
+
+# Show the first processed record as a sanity check (skipped when empty).
+if data:
+    print(data[0])
diff --git a/datasets/processed/process_merge.py b/datasets/processed/process_merge.py
new file mode 100644
--- /dev/null
+++ b/datasets/processed/process_merge.py
@@ -0,0 +1,34 @@
+import os
+import json
+
+# Directory scanned for JSON files (the current working directory).
+directory_path = './'
+
+# Name of the merged output file. It is written into the working directory,
+# which is also the directory being scanned, so it must be skipped as an
+# input; otherwise a re-run would merge the previous result into itself.
+output_filename = 'combined_data.json'
+
+# Accumulates the records from every JSON file found.
+combined_list = []
+
+# Visit the files in sorted order so the merged result is reproducible
+# (os.listdir returns entries in arbitrary order).
+for filename in sorted(os.listdir(directory_path)):
+    # Only merge .json files, and never the output of a previous run.
+    if filename.endswith('.json') and filename != output_filename:
+        file_path = os.path.join(directory_path, filename)
+
+        with open(file_path, 'r', encoding='utf-8') as json_file:
+            data = json.load(json_file)
+
+        # A file holding a list contributes its elements; any other
+        # top-level value is appended as a single record.
+        if isinstance(data, list):
+            combined_list.extend(data)
+        else:
+            combined_list.append(data)
+
+# Save the merged records to a new JSON file.
+with open(output_filename, 'w', encoding='utf-8') as combined_json_file:
+    json.dump(combined_list, combined_json_file, ensure_ascii=False, indent=4)
diff --git a/datasets/processed/process_single_turn_conversation_construction.py b/datasets/processed/process_single_turn_conversation_construction.py
new file mode 100644
--- /dev/null
+++ b/datasets/processed/process_single_turn_conversation_construction.py
@@ -0,0 +1,33 @@
+import json
+
+# Name of the single-turn dataset to convert; switch by uncommenting.
+# file_name = 'single_turn_dataset_1.json'
+file_name = 'single_turn_dataset_2.json'
+with open(f'/root/StableCascade/emollm2/EmoLLM/datasets/{file_name}', 'rt', encoding='utf-8') as file:
+    format1_data = json.load(file)
+
+# System prompt attached to every converted conversation.
+system = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。"
+
+# Convert each {"prompt", "completion"} record into the conversation format:
+# a one-turn "conversation" list holding system / input / output.
+format2_data = [
+    {
+        "conversation": [
+            {
+                "system": system,
+                "input": item["prompt"],
+                "output": item["completion"],
+            }
+        ]
+    }
+    for item in format1_data
+]
+
+# Write the converted records into the current working directory.
+with open(f'./processed_{file_name}', 'wt', encoding='utf-8') as file:
+    json.dump(format2_data, file, ensure_ascii=False, indent=4)
+
+# Show the first converted record as a sanity check (skipped when empty).
+if format2_data:
+    print(format2_data[0])