From a38ef600587df3f036c4ad9572998d6e4882fed2 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Mon, 18 Mar 2024 20:16:39 +0900
Subject: [PATCH] modified merge_jsonl and merge_jsonl_r
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

merge_jsonl merges the jsonl files in a folder
merge_jsonl_r merges the jsonl files in each subfolder of a folder

usage:
python merge_jsonl_r.py > qwen2.txt
python merge_jsonl_r.py > zhipuai.txt
python merge_jsonl.py > curr.txt

│  学业_merge.json
│  家人_merge.json
│  就业_merge.json
│  工作_merge.json
│  恋爱_merge.json
│  朋友_merge.json
│  环境_merge.json
│  生活_merge.json
│  社交_merge.json
│  责任_merge.json
│  身体_merge.json
│  隐私_merge.json
│
├───学业
│      兴奋.jsonl
│      冷静.jsonl
│      厌倦.jsonl
│      厌恶.jsonl
│      同情.jsonl
│      困惑.jsonl
│      娱乐.jsonl
│      嫉妒.jsonl
│      尴尬.jsonl
│      崇拜.jsonl
│      快乐.jsonl
│      怀旧.jsonl
│      性欲.jsonl
│      恐惧.jsonl
│      悲伤.jsonl
│      敬畏.jsonl
│      有趣.jsonl
│      欣赏.jsonl
│      浪漫.jsonl
│      渴望.jsonl
│      满意.jsonl
│      满足.jsonl
│      焦虑.jsonl
│      痛恨.jsonl
│      痛苦.jsonl
│      着迷.jsonl
│      钦佩.jsonl
│
├───家人
│      兴奋.jsonl
│      冷静.jsonl
│      厌倦.jsonl
│      厌恶.jsonl
│      同情.jsonl
│      困惑.jsonl
│      娱乐.jsonl
│      嫉妒.jsonl
---
 .gitignore                                |  1 +
 generate_data/final_data/merge_jsonl.py   | 70 +++++++++++++++++++
 generate_data/final_data/merge_jsonl_r.py | 85 +++++++++++++++++++++++
 3 files changed, 156 insertions(+)
 create mode 100644 generate_data/final_data/merge_jsonl.py
 create mode 100644 generate_data/final_data/merge_jsonl_r.py

diff --git a/.gitignore b/.gitignore
index b2c615a..7467647 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ zhipuai/
 data/
 
 *.jsonl
+*.json
 
 # ./generate_data/*.josnl
 # ./generate_data/*/*/*.josnl
diff --git a/generate_data/final_data/merge_jsonl.py b/generate_data/final_data/merge_jsonl.py
new file mode 100644
index 0000000..b8edd10
--- /dev/null
+++ b/generate_data/final_data/merge_jsonl.py
@@ -0,0 +1,70 @@
+"""Merge every .jsonl file in one folder into a single JSON array file."""
+import json
+import os
+
+
+def save_merge_json(data_lis, file_path):
+    """Write data_lis to file_path as UTF-8 JSON, keeping non-ASCII text."""
+    with open(file_path, 'wt', encoding='utf-8') as file:
+        # Items are separated by ',\n' and keys from values by ':'.
+        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))
+
+
+def get_all_file_paths(folder_path, file_type='.jsonl'):
+    """Return the sorted paths of files in folder_path ending with file_type.
+
+    Raises ValueError if folder_path is not a directory.
+    """
+    # Make sure the argument is a directory.
+    if not os.path.isdir(folder_path):
+        raise ValueError(f"{folder_path} is not a valid directory")
+
+    # Match on the suffix so names such as 'a.jsonl.bak' are skipped, and
+    # sort because os.listdir returns entries in arbitrary order.
+    candidates = (os.path.join(folder_path, name)
+                  for name in sorted(os.listdir(folder_path))
+                  if name.endswith(file_type))
+    return [path for path in candidates if os.path.isfile(path)]
+
+
+if __name__ == '__main__':
+    conversion_lis = []
+
+    folder_path = r'./'
+
+    # The last two path components decide where the merged file is written.
+    path_parts = folder_path.split('/')
+    merge_path = path_parts[-1]
+    if len(path_parts) >= 2 and path_parts[-2] != '.':
+        merge_last_path = path_parts[-2]
+    else:
+        merge_last_path = ''
+    print(f'merge_path={merge_path},merge_last_path={merge_last_path}')
+
+    for path in get_all_file_paths(folder_path):
+        print(path)
+
+        with open(path, 'rt', encoding='utf-8') as file:
+            for line in file:
+                line = line.strip()
+                if not line:
+                    # Blank lines carry no record; skip them quietly.
+                    continue
+                # Parse one JSON document per line.
+                try:
+                    data = json.loads(line)
+                except json.JSONDecodeError as e:
+                    print(f"Error decoding JSON: {e}")
+                else:
+                    conversion_lis.append(data)
+
+    if merge_last_path != '':
+        save_merge_json_path = f'./{merge_last_path}/{merge_path}_merge.json'
+    elif merge_path != '':
+        save_merge_json_path = f'./{merge_path}_merge.json'
+    else:
+        save_merge_json_path = './curr_merge.json'
+
+    save_merge_json(data_lis=conversion_lis,
+                    file_path=save_merge_json_path)
+    print(len(conversion_lis), save_merge_json_path)
diff --git a/generate_data/final_data/merge_jsonl_r.py b/generate_data/final_data/merge_jsonl_r.py
new file mode 100644
index 0000000..a29c951
--- /dev/null
+++ b/generate_data/final_data/merge_jsonl_r.py
@@ -0,0 +1,85 @@
+"""Merge the .jsonl files of every subfolder, then merge all subfolders."""
+import json
+import os
+
+
+def save_merge_json(data_lis, file_path):
+    """Write data_lis to file_path as UTF-8 JSON, keeping non-ASCII text."""
+    with open(file_path, 'wt', encoding='utf-8') as file:
+        # Items are separated by ',\n' and keys from values by ':'.
+        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))
+
+
+def get_all_file_paths(folder_path, file_type='.jsonl'):
+    """Return the sorted paths of files in folder_path ending with file_type.
+
+    Raises ValueError if folder_path is not a directory.
+    """
+    # Make sure the argument is a directory.
+    if not os.path.isdir(folder_path):
+        raise ValueError(f"{folder_path} is not a valid directory")
+
+    # Match on the suffix so names such as 'a.jsonl.bak' are skipped, and
+    # sort because os.listdir returns entries in arbitrary order.
+    candidates = (os.path.join(folder_path, name)
+                  for name in sorted(os.listdir(folder_path))
+                  if name.endswith(file_type))
+    return [path for path in candidates if os.path.isfile(path)]
+
+
+if __name__ == '__main__':
+
+    data_ai = 'qwen'  # python merge_jsonl_r.py > qwen.txt
+    # data_ai = 'zhipuai'  # python merge_jsonl_r.py > zhipuai.txt
+    root_dir = f'./{data_ai}/'
+
+    save_final_merge_json_path = f'{data_ai}_final_merge.json'
+
+    # Sorted so the merged output does not depend on os.listdir order.
+    subfolders = [os.path.join(root_dir, d) for d in sorted(os.listdir(root_dir))
+                  if os.path.isdir(os.path.join(root_dir, d))]
+
+    final_list = []
+    for folder_path in subfolders:
+        conversion_lis = []
+        # The last two path components decide where the merged file is written.
+        path_parts = folder_path.split('/')
+        merge_path = path_parts[-1]
+        if len(path_parts) >= 2 and path_parts[-2] != '.':
+            merge_last_path = path_parts[-2]
+        else:
+            merge_last_path = ''
+        print(f'merge_path={merge_path},merge_last_path={merge_last_path}')
+
+        for path in get_all_file_paths(folder_path):
+            print(path)
+
+            with open(path, 'rt', encoding='utf-8') as file:
+                for line in file:
+                    line = line.strip()
+                    if not line:
+                        # Blank lines carry no record; skip them quietly.
+                        continue
+                    # Parse one JSON document per line.
+                    try:
+                        data = json.loads(line)
+                    except json.JSONDecodeError as e:
+                        print(f"Error decoding JSON: {e}")
+                    else:
+                        conversion_lis.append(data)
+
+        if merge_last_path != '':
+            save_merge_json_path = f'./{merge_last_path}/{merge_path}_merge.json'
+        elif merge_path != '':
+            save_merge_json_path = f'./{merge_path}_merge.json'
+        else:
+            save_merge_json_path = './curr_merge.json'
+
+        save_merge_json(data_lis=conversion_lis,
+                        file_path=save_merge_json_path)
+
+        final_list.extend(conversion_lis)
+        print(len(conversion_lis), len(final_list), save_merge_json_path)
+
+    save_merge_json(data_lis=final_list, file_path=save_final_merge_json_path)
+    print(save_final_merge_json_path)