Merge pull request #4 from aJupyter/dev

ADD run_qwen.bash
This commit is contained in:
xzwang 2024-01-18 22:50:58 +08:00 committed by GitHub
commit c5302d9198
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 45 additions and 15 deletions

View File

@ -1,6 +1,7 @@
import json
import random
import argparse
import re
from tqdm import tqdm
@ -109,36 +110,38 @@ if __name__ == '__main__':
print(res)
# 一次会话
for itm in res.split('\n'):
if itm.startswith("病人:"):
dia_tuple.append(itm.split("")[1])
elif itm.startswith("医生:"):
dia_tuple.append(itm.split("")[1])
doctor_pattern = r'医生:(.*?)(病人:|$)'
if len(dia_tuple) == 2 and len(one_conversation['conversation']) == 0:
doctor_matches = re.findall(doctor_pattern, res, re.DOTALL)
doctor_conversations = [match[0] for match in doctor_matches]
patient_pattern = r'病人:(.*?)医生:'
patient_matches = re.findall(patient_pattern, res, re.DOTALL)
patient_conversations = [match for match in patient_matches]
for doc, pat in zip(doctor_conversations, patient_conversations):
if len(one_conversation['conversation']) == 0:
one_conversation['conversation'].append(
{
"system": "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。",
"input": dia_tuple[0],
"output": dia_tuple[1]
"input": pat,
"output": doc
},
)
dia_tuple = []
elif len(dia_tuple) == 2:
else:
one_conversation['conversation'].append(
{
"input": dia_tuple[0],
"output": dia_tuple[1]
"input": pat,
"output": doc
},
)
dia_tuple = []
conversation_lis.append(one_conversation)
idx += 1
# 每生成2条数据存储一次
if (idx % 2 == 0):
# 每生成10条数据存储一次
if (idx % 10 == 0):
path = f'./{args.data}.jsonl'
save_jsonl(data_lis=conversation_lis, file_path=path)
conversation_lis = [] # 清空

27
data/run_qwen.bash Normal file
View File

@ -0,0 +1,27 @@
#!/bin/bash
# Runs qwen_gen_data.py once for every life area listed below, passing the
# area name as the --data argument.
#
# Usage: run from the directory that contains qwen_gen_data.py (it is
# referenced through a relative path).
# Exit status: non-zero if the generator failed for at least one area.

# Life areas handed to the generator, one per invocation.
areas_of_life=(
  "工作"
  "学业"
  "生活"
  "身体"
  "家人"
  "朋友"
  "社交"
  "恋爱"
  "就业"
  "责任"
  "爱好"
  "环境"
  "隐私"
  "安全"
  "梦想"
  "自由"
)

# Areas whose generator run returned a non-zero status.
failed_areas=()

# Walk the array; a failing area is recorded and reported, but the remaining
# areas are still processed.
for area in "${areas_of_life[@]}"; do
  echo "当前生活领域: $area"
  # Quoted so the value always reaches the generator as exactly one argument,
  # with no word splitting or glob expansion.
  if ! python qwen_gen_data.py --data "$area"; then
    printf 'warning: qwen_gen_data.py failed for area: %s\n' "$area" >&2
    failed_areas+=("$area")
  fi
done

# Last command on purpose: the script's exit status is 0 only when every area
# succeeded.
[[ ${#failed_areas[@]} -eq 0 ]]