Add dataset processing scripts

1. Update process.py to handle multi_turn_dataset_1.json, multi_turn_dataset_2.json, data.json and data_pro.json. 2. Add datasets/processed/process_single_turn_conversation_construction.py for the single-turn datasets (single_turn_dataset_1.json and single_turn_dataset_2.json). 3. Add datasets/processed/process_merge.py to merge the 6 processed datasets in datasets/processed/.
2024-03-21 16:01:54 +09:00 · 2024-03-21 16:01:54 +09:00 · 085a01eafa
commit 085a01eafa
parent ce2cb5156c
3 changed files with 81 additions and 3 deletions
--- a/datasets/processed/process.py
+++ b/datasets/processed/process.py
@ -1,12 +1,25 @@
 import json

 # 打开JSON文件并读取其内容
-with open('/root/Emollm/datasets/multi_turn_dataset_2.json', 'rt', encoding='utf-8') as file:
+
+# file_name = 'multi_turn_dataset_1.json' 
+# file_name = 'multi_turn_dataset_2.json' 
+# file_name = 'data_pro.json' 
+file_name = 'data.json'  # dataset to process; the commented-out names above are the alternatives
+
+with open(f'/root/StableCascade/emollm2/EmoLLM/datasets/{file_name}', 'rt', encoding='utf-8') as file:
     data = json.load(file)

 n = 0
 for i in data:
-    i['conversation'][0]['system'] = "你是心理健康助手EmoLLM，由EmoLLM团队打造。你旨在通过专业心理咨询，协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术，一步步帮助来访者解决心理问题。"
+    
+    try:  # set the system prompt on the first turn of each record
+        i['conversation'][0]['system'] = "你是心理健康助手EmoLLM，由EmoLLM团队打造。你旨在通过专业心理咨询，协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术，一步步帮助来访者解决心理问题。"
+    except:  # NOTE(review): bare except also hides unrelated errors; consider narrowing to the lookup errors expected here
+        print(n,i)   # report the index and the record that failed; original note: 4 empty entries in data.json at n = 425, 483, 742, 1120
+    n+=1

-with open('output2.json', 'wt', encoding='utf-8') as file:
+with open(f'processed_{file_name}', 'wt', encoding='utf-8') as file:
     json.dump(data, file, ensure_ascii=False, indent=4)
+
+print(data[0])  # sanity check: show the first record after processing
--- a/datasets/processed/process_merge.py
+++ b/datasets/processed/process_merge.py
@ -0,0 +1,34 @@
+import os
+import json
+
+# Directory to scan for JSON files (here: the current working directory)
+directory_path = './'
+
+# Accumulator for the data loaded from every JSON file
+combined_list = []
+
+# Walk every entry in the target directory
+for filename in os.listdir(directory_path):
+    # Only files with a .json extension are merged
+    if filename.endswith('.json'):
+        # Build the path to the file
+        file_path = os.path.join(directory_path, filename)
+        
+        # Open and read the JSON file
+        with open(file_path, 'r', encoding='utf-8') as json_file:
+            # Parse the file contents
+            data = json.load(json_file)
+            
+            # Add the loaded data to combined_list:
+            # a top-level list is flattened into it, any other value is appended as a single element
+            if isinstance(data, list):
+                combined_list.extend(data)
+            else:
+                combined_list.append(data)
+
+# Print the merged list (disabled)
+# print(combined_list)
+
+# Save the merged list to combined_data.json. NOTE(review): this file lands in the scanned directory, so a re-run would merge the previous output back in
+with open('combined_data.json', 'w', encoding='utf-8') as combined_json_file:
+    json.dump(combined_list, combined_json_file, ensure_ascii=False, indent=4)
--- a/datasets/processed/process_single_turn_conversation_construction.py
+++ b/datasets/processed/process_single_turn_conversation_construction.py
@ -0,0 +1,31 @@
+import json
+
+# Open the source JSON file and load its contents
+# file_name = 'single_turn_dataset_1.json' 
+file_name = 'single_turn_dataset_2.json'  
+with open(f'/root/StableCascade/emollm2/EmoLLM/datasets/{file_name}', 'rt', encoding='utf-8') as file:
+    format1_data = json.load(file)
+
+# n = 0
+# for i in data:
+#     i['conversation'][0]['system'] = "你是心理健康助手EmoLLM，由EmoLLM团队打造。你旨在通过专业心理咨询，协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术，一步步帮助来访者解决心理问题。"
+
+system = "你是心理健康助手EmoLLM，由EmoLLM团队打造。你旨在通过专业心理咨询，协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术，一步步帮助来访者解决心理问题。"
+
+# Convert each prompt/completion record into a one-turn "conversation" entry carrying the system prompt
+format2_data = []
+for item in format1_data:
+    conversation = {
+        "system": system,
+        "input": item["prompt"],
+        "output": item["completion"]
+    }
+    format2_data.append({"conversation": [conversation]})
+
+# Write the converted data out as JSON
+
+
+with open(f'./processed_{file_name}', 'wt', encoding='utf-8') as file:
+    json.dump(format2_data, file, ensure_ascii=False, indent=4)
+
+print(format2_data[0])  # sanity check: show the first converted record