ADD merge_json.py @aJupyter
This commit is contained in:
parent
e89aa06300
commit
8ca5f6dcd0
39
data/merge_json.py
Normal file
39
data/merge_json.py
Normal file
@ -0,0 +1,39 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
def save_merge_json(data_lis, file_path):
    """Write *data_lis* to *file_path* as a single UTF-8 encoded JSON document.

    Args:
        data_lis: JSON-serializable object (the script passes a list of
            parsed records).
        file_path: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the destination cannot be opened for writing.
        TypeError: If *data_lis* contains a value ``json`` cannot serialize.
    """
    # The module-level ``json`` import is used; no function-local re-import.
    with open(file_path, 'wt', encoding='utf-8') as file:
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # emitting \uXXXX escapes.
        json.dump(data_lis, file, ensure_ascii=False)
|
||||
|
||||
|
||||
def get_all_file_paths(folder_path):
    """Return the paths of the regular files located directly in *folder_path*.

    Sub-directories are skipped and not descended into. Each returned path is
    *folder_path* joined with the entry name, and the result is sorted by
    entry name so that callers see the same order on every run.

    Args:
        folder_path: Path of the directory to list.

    Returns:
        A list of file paths, sorted by file name.

    Raises:
        ValueError: If *folder_path* is not an existing directory.
    """
    # Make sure the argument really is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    # os.listdir() returns names in arbitrary order; sorting makes the
    # listing reproducible across platforms and runs.
    candidates = (
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
    )
    return [path for path in candidates if os.path.isfile(path)]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Parsed records gathered from every file found directly under res/.
    conversion_lis = []

    for path in get_all_file_paths('res/'):
        print(path)

        # Open the file currently being visited (not one fixed file name),
        # so that every file in the directory contributes its records.
        with open(path, 'rt', encoding='utf-8') as file:
            for line in file:
                # Strip the trailing newline before parsing.
                line = line.rstrip('\n')
                # Each line is expected to hold one JSON document; lines that
                # fail to parse are reported and skipped.
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")
                else:
                    conversion_lis.append(data)

    # Write the combined records once, after all inputs have been read.
    save_merge_json(data_lis=conversion_lis, file_path='merge.json')
|
78
data/trans_process.py
Normal file
78
data/trans_process.py
Normal file
@ -0,0 +1,78 @@
|
||||
import json
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def qwen_api(prompt):
    """Send *prompt* to the ``qwen-max`` model with a translation instruction.

    A fixed Chinese instruction (asking for an English-to-Chinese translation
    and for only the translated sentence to be returned) is prepended to
    *prompt* before the request is made through ``dashscope.Generation.call``.

    Args:
        prompt: The text to be translated.

    Returns:
        ``response.output.text`` when the response status is HTTP 200,
        otherwise the literal string ``'ERROR'``.
    """
    import dashscope
    from http import HTTPStatus

    # NOTE(review): "your key" looks like a placeholder that presumably has
    # to be replaced with a real API key before running - confirm.
    dashscope.api_key = "your key"

    instruction = "你是一位非常擅长将英文翻译成中文的专家。请你将下面的英文翻译成正确地道的中文,要求只返回翻译的中文句子:\n"
    reply = dashscope.Generation.call(
        model='qwen-max',
        prompt=instruction + prompt,
        history=[],
    )

    # Any non-OK status is collapsed into the 'ERROR' sentinel string.
    if reply.status_code != HTTPStatus.OK:
        return 'ERROR'
    return reply.output.text
|
||||
|
||||
|
||||
def get_conversation_list():
    """Translate the dialogs in ``./ESConv.json`` and group them into turns.

    The JSON file is loaded as an iterable of items, each with a ``'dialog'``
    list whose entries carry ``'speaker'`` and ``'content'`` keys. Every
    utterance is passed through ``qwen_api`` and consecutive translated
    utterances are paired into ``{"input": ..., "output": ...}`` turns. The
    first turn of each conversation additionally carries a ``"system"`` prompt.

    Returns:
        A list of ``{"conversation": [turn, ...]}`` dicts, one per item in
        the input file.

    Raises:
        SystemExit: If an utterance has a speaker other than ``'seeker'`` or
            ``'supporter'``.
    """
    # sys.exit() is used instead of the site-provided exit(), which is meant
    # for interactive sessions and is missing when Python runs with -S.
    import sys

    with open('./ESConv.json', 'rt', encoding='utf-8') as file:
        data = json.load(file)

    known_speakers = ('seeker', 'supporter')
    system_prompt = "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。"

    conversation_list = []
    for itm in tqdm(data):
        turns = []
        pending = []  # translated utterances waiting to be paired
        for dia in tqdm(itm['dialog']):
            if dia['speaker'] not in known_speakers:
                sys.exit("不存在角色!")
            pending.append(qwen_api(dia['content']))

            if len(pending) < 2:
                continue

            # NOTE(review): utterances are paired in arrival order without
            # checking who spoke, so two consecutive utterances by the same
            # speaker also form an input/output pair, and a trailing unpaired
            # utterance is dropped - confirm this is intended.
            turn = {"input": pending[0], "output": pending[1]}
            if not turns:
                # Only the first turn of a conversation carries the system
                # prompt; "system" stays the first key of that dict.
                turn = {"system": system_prompt, **turn}
            turns.append(turn)
            pending = []

        conversation_list.append({"conversation": turns})

    # Report how many conversations were processed.
    print(len(conversation_list))
    return conversation_list
|
||||
|
||||
|
||||
if __name__ == '__main__':
    conversation_list = get_conversation_list()

    # Persist the translated conversations as one JSON file; non-ASCII
    # characters are written as-is rather than as \uXXXX escapes.
    with open(
        'conversation_list.json', 'wt', encoding='utf-8'
    ) as f:
        json.dump(
            conversation_list,
            f,
            ensure_ascii=False,
        )
|
Loading…
Reference in New Issue
Block a user