diff --git a/data/qwen_gen_data.py b/data/qwen_gen_data.py
index 7c6086b..f2dddb8 100644
--- a/data/qwen_gen_data.py
+++ b/data/qwen_gen_data.py
@@ -1,6 +1,7 @@
 import json
 import random
 import argparse
+import re
 
 from tqdm import tqdm
 
@@ -109,36 +110,38 @@ if __name__ == '__main__':
         print(res)
 
         # 一次会话
-        for itm in res.split('\n'):
-            if itm.startswith("病人:"):
-                dia_tuple.append(itm.split(":")[1])
-            elif itm.startswith("医生:"):
-                dia_tuple.append(itm.split(":")[1])
+        # Match each patient turn together with the doctor reply that follows
+        # it, so an input is never paired with a reply from another turn.
+        turn_pattern = r'病人:(.*?)医生:(.*?)(?=病人:|$)'
 
-            if len(dia_tuple) == 2 and len(one_conversation['conversation']) == 0:
+        turn_matches = re.findall(turn_pattern, res, re.DOTALL)
+
+        for pat, doc in turn_matches:
+            # The lazy groups keep the line break before the next speaker tag.
+            pat, doc = pat.strip(), doc.strip()
+            # Only the first turn of a conversation carries the system prompt.
+            if not one_conversation['conversation']:
                 one_conversation['conversation'].append(
                     {
                         "system": "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。",
-                        "input": dia_tuple[0],
-                        "output": dia_tuple[1]
+                        "input": pat,
+                        "output": doc
                     },
                 )
-                dia_tuple = []
-            elif len(dia_tuple) == 2:
+            else:
                 one_conversation['conversation'].append(
                     {
-                        "input": dia_tuple[0],
-                        "output": dia_tuple[1]
+                        "input": pat,
+                        "output": doc
                     },
                 )
-                dia_tuple = []
 
         conversation_lis.append(one_conversation)
 
         idx += 1
 
-        # 每生成2条数据存储一次
-        if (idx % 2 == 0):
+        # Save once every 10 generated records.
+        if (idx % 10 == 0):
             path = f'./{args.data}.jsonl'
             save_jsonl(data_lis=conversation_lis, file_path=path)
             conversation_lis = [] # 清空
diff --git a/data/run_qwen.bash b/data/run_qwen.bash
new file mode 100644
index 0000000..cf07df9
--- /dev/null
+++ b/data/run_qwen.bash
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Life areas to generate conversation data for.
+areas_of_life=(
+    "工作"
+    "学业"
+    "生活"
+    "身体"
+    "家人"
+    "朋友"
+    "社交"
+    "恋爱"
+    "就业"
+    "责任"
+    "爱好"
+    "环境"
+    "隐私"
+    "安全"
+    "梦想"
+    "自由"
+)
+
+# Run the generator once per life area; the area is passed as --data.
+for area in "${areas_of_life[@]}"; do
+    echo "当前生活领域: $area"
+    python qwen_gen_data.py --data "$area"
+done