# Reconstructed from a git patch whose line breaks had been lost.
#
#   Commit : 8ca5f6dcd04f9f5a09e1c2ac2b0d0c12803f52d4
#   From   : jupyter
#   Date   : Fri, 19 Jan 2024 15:02:00 +0800
#   Subject: [PATCH] ADD merge_json.py @aJupyter
#
# The patch creates two independent scripts (117 insertions in total):
#   data/merge_json.py     (39 lines)
#   data/trans_process.py  (78 lines)
# Each script is reproduced below in its own section. They belong in separate
# files; each section keeps its own ``if __name__ == '__main__':`` entry point.

import json
import os
from itertools import groupby
from operator import itemgetter

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic, so keep working without tqdm.
    def tqdm(iterable, *_args, **_kwargs):
        """Stand-in for ``tqdm.tqdm`` that iterates without a progress bar."""
        return iterable


# ---------------------------------------------------------------------------
# data/merge_json.py
# ---------------------------------------------------------------------------

def save_merge_json(data_lis, file_path):
    """Write ``data_lis`` to ``file_path`` as a single UTF-8 JSON document.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) rather
    than as ``\\uXXXX`` escapes.
    """
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False)


def get_all_file_paths(folder_path):
    """Return the paths of the regular files directly inside ``folder_path``.

    Sub-directories are neither listed nor descended into. The result is
    sorted by file name so that repeated runs visit files in the same order.

    Raises:
        ValueError: if ``folder_path`` is not an existing directory.
    """
    # Make sure a directory was passed in.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    # os.listdir() returns entries in arbitrary order; sort for determinism.
    candidates = (
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
    )
    return [path for path in candidates if os.path.isfile(path)]


def _load_jsonl(file_path):
    """Parse one JSON Lines file and return its records as a list.

    Blank lines are ignored. A line that is not valid JSON is reported on
    stdout (with its file and line number) and skipped.
    """
    records = []
    with open(file_path, 'rt', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON ({file_path}:{line_no}): {e}")
    return records


if __name__ == '__main__':
    conversion_lis = []

    for path in get_all_file_paths('res/'):
        print(path)
        # Read the file the loop is currently visiting. The original opened
        # the fixed path 'res/学业.jsonl' here, so no merging took place.
        conversion_lis.extend(_load_jsonl(path))

    save_merge_json(data_lis=conversion_lis, file_path='merge.json')


# ---------------------------------------------------------------------------
# data/trans_process.py
# ---------------------------------------------------------------------------

# API key handed to dashscope. Read from the environment when available so
# the secret does not have to be written into the source; the placeholder is
# kept as the fallback for backward compatibility.
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "your key")

# Value returned by qwen_api() when the request did not succeed.
TRANSLATION_ERROR = 'ERROR'

# Instruction placed in front of every text sent for translation.
TRANSLATION_INSTRUCTION = (
    "你是一位非常擅长将英文翻译成中文的专家。请你将下面的英文翻译成正确地道的中文,要求只返回翻译的中文句子:\n"
)

# System prompt stored on the first exchange of every conversation.
SYSTEM_PROMPT = "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。"

_KNOWN_SPEAKERS = ('seeker', 'supporter')


def qwen_api(prompt):
    """Translate the English text ``prompt`` into Chinese via ``qwen-max``.

    Returns:
        The text of the model response, or ``'ERROR'`` when the response
        status is not HTTP 200. The failing status code is printed so that
        failures are visible while a long run is in progress.
    """
    import dashscope
    from http import HTTPStatus

    dashscope.api_key = DASHSCOPE_API_KEY
    response = dashscope.Generation.call(
        model='qwen-max',
        prompt=TRANSLATION_INSTRUCTION + prompt,
        history=[],
    )

    if response.status_code == HTTPStatus.OK:
        return response.output.text

    print(f"Translation request failed (status {response.status_code})")
    return TRANSLATION_ERROR


def _pair_turns(dialog):
    """Split one dialogue into ``(seeker_text, supporter_text)`` exchanges.

    Consecutive utterances by the same speaker are joined with a space into
    one turn. A supporter turn with no seeker turn before it, and a trailing
    seeker turn with no reply, cannot form an exchange and are dropped.

    Raises:
        SystemExit: if an utterance has a speaker other than ``'seeker'`` or
            ``'supporter'`` (same behaviour as the original ``exit()`` call,
            without depending on the ``site`` module).
    """
    exchanges = []
    pending_seeker = None
    for speaker, utterances in groupby(dialog, key=itemgetter('speaker')):
        if speaker not in _KNOWN_SPEAKERS:
            raise SystemExit("不存在角色!")
        text = ' '.join(dia['content'].strip() for dia in utterances)
        if speaker == 'seeker':
            pending_seeker = text
        elif pending_seeker is not None:
            exchanges.append((pending_seeker, text))
            pending_seeker = None
    return exchanges


def get_conversation_list(file_path='./ESConv.json', translate=None):
    """Build translated multi-turn conversations from a dialogue JSON file.

    The file must contain a list of items, each with a ``'dialog'`` list of
    ``{'speaker': ..., 'content': ...}`` entries. Every dialogue becomes
    ``{"conversation": [...]}`` where each element holds the translated
    seeker text as ``"input"`` and the translated supporter reply as
    ``"output"``; the first element additionally carries ``"system"``.

    Args:
        file_path: dialogue file to read (defaults to the original path).
        translate: callable mapping an English string to its translation;
            defaults to :func:`qwen_api`.

    Returns:
        A list with one conversation dict per dialogue, in file order.

    Exchanges for which either translation comes back as ``'ERROR'`` are
    reported and left out, so the error marker never reaches the dataset.
    """
    if translate is None:
        translate = qwen_api

    with open(file_path, 'rt', encoding='utf-8') as file:
        data = json.load(file)

    conversation_list = []
    for itm in tqdm(data):
        turns = []
        for seeker_text, supporter_text in tqdm(_pair_turns(itm['dialog'])):
            zh_input = translate(seeker_text)
            zh_output = translate(supporter_text)
            if TRANSLATION_ERROR in (zh_input, zh_output):
                print(
                    "Skipping exchange with failed translation "
                    f"(dialogue {len(conversation_list)})"
                )
                continue

            turn = {"input": zh_input, "output": zh_output}
            if not turns:
                # Only the opening exchange carries the system prompt.
                turn = {"system": SYSTEM_PROMPT, **turn}
            turns.append(turn)

        conversation_list.append({"conversation": turns})

    print(len(conversation_list))
    return conversation_list


if __name__ == '__main__':
    conversation_list = get_conversation_list()
    # Save conversation_list as a JSON file.
    with open('conversation_list.json', 'wt', encoding='utf-8') as f:
        json.dump(conversation_list, f, ensure_ascii=False)