diff --git a/.gitignore b/.gitignore index 7467647..b2c615a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ zhipuai/ data/ *.jsonl -*.json # ./generate_data/*.josnl # ./generate_data/*/*/*.josnl diff --git a/generate_data/final_data/merge_jsonl.py b/generate_data/final_data/merge_jsonl.py deleted file mode 100644 index b8edd10..0000000 --- a/generate_data/final_data/merge_jsonl.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -import os - - -def save_merge_json(data_lis, file_path): - with open(file_path, 'wt', encoding='utf-8') as file: - json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':')) - - -def get_all_file_paths(folder_path, file_type='.jsonl'): - # 确保传入的是一个目录 - if not os.path.isdir(folder_path): - raise ValueError(f"{folder_path} is not a valid directory") - - # 获取文件夹下所有文件的路径 - file_paths = [os.path.join(folder_path, file) for file in os.listdir( - folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)] - return file_paths - - -if __name__ == '__main__': - conversion_lis = [] - - folder_path = r'./' - - merge_path = folder_path.split('/')[-1] - try: - merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' else '' - except: - merge_last_path = '' - print(f'merge_path={merge_path},merge_last_path={merge_last_path}') - - - for path in get_all_file_paths(folder_path): - print(path) - - with open(path, 'rt', encoding='utf-8') as file: - for line in file: - # # 移除行尾的换行符 - # if line == '\n': - # line = line.rstrip('\n') - line = line.rstrip('\n') - # 解析JSON - try: - data = json.loads(line) - conversion_lis.append(data) - # conversion_lis.append('\n') - except json.JSONDecodeError as e: - print(f"Error decoding JSON: {e}") - - if merge_last_path!='': - save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' - elif merge_path!='': - save_merge_json_path = rf'./{merge_path}_merge.json' - else: - save_merge_json_path = rf'./curr_merge.json' - - save_merge_json(data_lis=conversion_lis, - file_path=save_merge_json_path) - print(len(conversion_lis),save_merge_json_path) diff --git a/generate_data/final_data/merge_jsonl_r.py b/generate_data/final_data/merge_jsonl_r.py deleted file mode 100644 index a29c951..0000000 --- a/generate_data/final_data/merge_jsonl_r.py +++ /dev/null @@ -1,75 +0,0 @@ -import json -import os - - -def save_merge_json(data_lis, file_path): - with open(file_path, 'wt', encoding='utf-8') as file: - json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':')) - - -def get_all_file_paths(folder_path, file_type='.jsonl'): - # 确保传入的是一个目录 - if not os.path.isdir(folder_path): - raise ValueError(f"{folder_path} is not a valid directory") - - # 获取文件夹下所有文件的路径 - file_paths = [os.path.join(folder_path, file) for file in os.listdir( - folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)] - return file_paths - - -if __name__ == '__main__': - - data_ai = 'qwen' # python merge_jsonl_r.py > qwen.txt - # data_ai = 'zhipuai' # python merge_jsonl_r.py > zhipuai.txt - root_dir = rf'./{data_ai}/' - - save_final_merge_json_path = f'{data_ai}_final_merge.json' - - subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))] - - final_list = [] - for folder_path in subfolders: - conversion_lis = [] - merge_path = folder_path.split('/')[-1] - try: - merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' else '' - except: - merge_last_path = '' - print(f'merge_path={merge_path},merge_last_path={merge_last_path}') - - - for path in get_all_file_paths(folder_path): - print(path) - - with open(path, 'rt', encoding='utf-8') as file: - for line in file: - # # 移除行尾的换行符 - # if line == '\n': - # line = line.rstrip('\n') - line = line.rstrip('\n') - # 解析JSON - try: - data = json.loads(line) - conversion_lis.append(data) - # conversion_lis.append('\n') - except json.JSONDecodeError as e: - print(f"Error decoding JSON: {e}") - - if merge_last_path!='': - save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' - elif merge_path!='': - save_merge_json_path = rf'./{merge_path}_merge.json' - else: - save_merge_json_path = rf'./curr_merge.json' - - save_merge_json(data_lis=conversion_lis, - file_path=save_merge_json_path) - - final_list = final_list+conversion_lis - print(len(conversion_lis),len(final_list),save_merge_json_path) - - save_merge_json(data_lis=final_list,file_path=save_final_merge_json_path) - print(save_final_merge_json_path) - -