diff --git a/finetune/ft_config.py b/config/ft_config.py
similarity index 100%
rename from finetune/ft_config.py
rename to config/ft_config.py
diff --git a/finetune/qwen_7b_chat_qlora_e3.py b/config/qwen_7b_chat_qlora_e3.py
similarity index 100%
rename from finetune/qwen_7b_chat_qlora_e3.py
rename to config/qwen_7b_chat_qlora_e3.py
diff --git a/scripts/check.py b/scripts/check.py
new file mode 100644
index 0000000..2557be4
--- /dev/null
+++ b/scripts/check.py
@@ -0,0 +1,68 @@
+import os
+import json
+
+ALLOWED_KEYS = ('system', 'input', 'output')
+
+
+def get_all_file_paths(folder_path, suffix=''):
+    """Recursively list the files under folder_path whose path ends with suffix.
+
+    The default empty suffix matches every file.
+    """
+    path = []
+    for file in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, file)
+        if os.path.isdir(file_path):
+            # Forward the suffix so the filter also applies in subfolders.
+            path.extend(get_all_file_paths(file_path, suffix))
+        elif file_path.endswith(suffix):
+            path.append(file_path)
+    return path
+
+
+def _is_valid_item(item):
+    """Return True if item is {'conversation': [turn, ...]} with valid turns.
+
+    A turn is a non-empty dict that only uses the keys in ALLOWED_KEYS and
+    whose values are all non-empty.
+    """
+    if not isinstance(item, dict) or set(item) != {'conversation'}:
+        return False
+    turns = item['conversation']
+    if not isinstance(turns, list) or not turns:
+        return False
+    for turn in turns:
+        if not isinstance(turn, dict) or not turn:
+            return False
+        for key, value in turn.items():
+            if key not in ALLOWED_KEYS:
+                return False
+            try:
+                if len(value) == 0:
+                    return False
+            except TypeError:
+                # Unsized values such as numbers or null are invalid too.
+                return False
+    return True
+
+
+def check(filepath):
+    """Validate the dataset JSON file at filepath and return a report line.
+
+    The report names the index of the first invalid item, or states that
+    the file has no error.
+    """
+    # Open the argument itself rather than a global set by the caller.
+    with open(filepath, 'rt', encoding='utf-8') as file:
+        data = json.load(file)
+    for idx, item in enumerate(data):
+        if not _is_valid_item(item):
+            return 'found error in file: ' + filepath + ' at conversation index: ' + str(idx)
+    return 'no error in file: ' + filepath
+
+
+if __name__ == '__main__':
+    dir_path = '.'
+    paths = get_all_file_paths(dir_path, suffix='.json')
+    for path in paths:
+        print(check(filepath=path))
\ No newline at end of file
diff --git a/scripts/merge_json.py b/scripts/merge_json.py
index 0171df9..2b34f09 100644
--- a/scripts/merge_json.py
+++ b/scripts/merge_json.py
@@ -7,7 +7,9 @@ def save_merge_json(data_lis, file_path):
         json.dump(data_lis, file, indent=4, ensure_ascii=False)
 
 
-def get_all_file_paths(folder_path):
+def get_all_file_paths(folder_path, suffix=''):
+    """Recursively list files under folder_path that end with suffix."""
+    print(folder_path)
     files = os.listdir(folder_path)
     path = []
     for file in files:
@@ -15,26 +17,26 @@ def get_all_file_paths(folder_path):
         if os.path.isdir(file_path):
-            path.extend(get_all_file_paths(file_path))
+            path.extend(get_all_file_paths(file_path, suffix))
         else:
-            path.append(file_path)
+            if file_path.endswith(suffix):
+                path.append(file_path)
     return path
 
 
 if __name__ == '__main__':
     conversion_lis = []
-    folder_path = '' # input
-    merge_path = '' # input
-    paths = get_all_file_paths(folder_path=folder_path)
+    folder_path = './' # input
+    merge_path = 'merge.json' # input
+    paths = get_all_file_paths(folder_path=folder_path, suffix='.json')
     for path in paths:
+        # Skip a previous merge result so it is not merged into itself.
+        if os.path.abspath(path) == os.path.abspath(merge_path):
+            continue
         print(path)
         with open(path, 'rt', encoding='utf-8') as lines:
-            for line in lines:
-                # 移除行尾的换行符
-                line.rstrip('\n')
-                # 解析JSON
-                try:
-                    data = json.loads(line)
-                    conversion_lis.append(data)
-                except json.JSONDecodeError as e:
-                    print(f"Error decoding JSON: {e}")
+            try:
+                # Each input file holds one JSON array; parse it whole.
+                conversion_lis.extend(json.load(lines))
+            except json.JSONDecodeError as e:
+                print(f"Error decoding JSON: {e}")
 
     save_merge_json(data_lis=conversion_lis, file_path=merge_path)
\ No newline at end of file