ADD merge_json.py @aJupyter

2024-01-19 15:02:00 +08:00 · 2024-01-19 15:02:00 +08:00 · 8ca5f6dcd0
commit 8ca5f6dcd0
parent e89aa06300
2 changed files with 117 additions and 0 deletions
--- a/data/merge_json.py
+++ b/data/merge_json.py
@ -0,0 +1,39 @@
+import json
+import os
+
+
+def save_merge_json(data_lis, file_path):
+    import json
+
+    with open(file_path, 'wt', encoding='utf-8') as file:
+        json.dump(data_lis, file, ensure_ascii=False)
+
+
+def get_all_file_paths(folder_path):
+    # 确保传入的是一个目录
+    if not os.path.isdir(folder_path):
+        raise ValueError(f"{folder_path} is not a valid directory")
+
+    # 获取文件夹下所有文件的路径
+    file_paths = [os.path.join(folder_path, file) for file in os.listdir(
+        folder_path) if os.path.isfile(os.path.join(folder_path, file))]
+    return file_paths
+
+
+if __name__ == '__main__':
+    conversion_lis = []
+
+    for path in get_all_file_paths('res/'):
+        print(path)
+
+        with open('res/学业.jsonl', 'rt', encoding='utf-8') as file:
+            for line in file:
+                # 移除行尾的换行符
+                line = line.rstrip('\n')
+                # 解析JSON
+                try:
+                    data = json.loads(line)
+                    conversion_lis.append(data)
+                except json.JSONDecodeError as e:
+                    print(f"Error decoding JSON: {e}")
+        save_merge_json(data_lis=conversion_lis, file_path='merge.json')
--- a/data/trans_process.py
+++ b/data/trans_process.py
@ -0,0 +1,78 @@
+import json
+from tqdm import tqdm
+
+
+def qwen_api(prompt):
+    import dashscope
+    from http import HTTPStatus
+
+    dashscope.api_key = "your key"
+    prompt = "你是一位非常擅长将英文翻译成中文的专家。请你将下面的英文翻译成正确地道的中文，要求只返回翻译的中文句子:\n" + prompt
+    response = dashscope.Generation.call(
+        model='qwen-max',
+        prompt=prompt,
+        history=[],
+    )
+
+    if response.status_code == HTTPStatus.OK:
+        result = response.output.text
+        # print(result)
+    else:
+        result = 'ERROR'
+    return result
+
+
+def get_conversation_list():
+    with open('./ESConv.json', 'rt', encoding='utf-8') as file:
+        data = json.load(file)
+
+    idx = 0
+    conversation_list = []
+    for itm in tqdm(data):
+        one_conversation = {
+            "conversation": []
+        }
+        dia_tuple = []
+        for dia in tqdm(itm['dialog']):
+            # print(dia['speaker'], dia['content'])
+            if dia['speaker'] == 'seeker':
+                dia_tuple.append(qwen_api(dia['content']))
+            elif dia['speaker'] == 'supporter':
+                dia_tuple.append(qwen_api(dia['content']))
+            else:
+                exit("不存在角色!")
+
+            if len(dia_tuple) == 2 and len(one_conversation['conversation']) == 0:
+                one_conversation['conversation'].append(
+                    {
+                        "system": "现在你是一个心理专家，我有一些心理问题，请你用专业的知识帮我解决。",
+                        "input": dia_tuple[0],
+                        "output": dia_tuple[1]
+                    },
+                )
+                dia_tuple = []
+
+            elif len(dia_tuple) == 2:
+                one_conversation['conversation'].append(
+                    {
+                        "input": dia_tuple[0],
+                        "output": dia_tuple[1]
+                    },
+                )
+                dia_tuple = []
+
+        conversation_list.append(one_conversation)
+        idx += 1
+
+        # if (idx == 1):
+        #     print(conversation_list)
+        #     break
+        print(idx)
+    return conversation_list
+
+
+if __name__ == '__main__':
+    conversation_list = get_conversation_list()
+    # 将conversation_list保存为一个json文件
+    with open('conversation_list.json', 'wt', encoding='utf-8') as f:
+        json.dump(conversation_list, f, ensure_ascii=False)