modified merge_jsonl and merge_jsonl_r
merge_jsonl merges the jsonl files in one folder; merge_jsonl_r merges the jsonl files in each subfolder of one folder. Usage: python merge_jsonl_r.py > qwen2.txt; python merge_jsonl_r.py > zhipuai.txt; python merge_jsonl.py > curr.txt. Resulting layout: │ 学业_merge.json │ 家人_merge.json │ 就业_merge.json │ 工作_merge.json │ 恋爱_merge.json │ 朋友_merge.json │ 环境_merge.json │ 生活_merge.json │ 社交_merge.json │ 责任_merge.json │ 身体_merge.json │ 隐私_merge.json │ ├───学业 │ 兴奋.jsonl │ 冷静.jsonl │ 厌倦.jsonl │ 厌恶.jsonl │ 同情.jsonl │ 困惑.jsonl │ 娱乐.jsonl │ 嫉妒.jsonl │ 尴尬.jsonl │ 崇拜.jsonl │ 快乐.jsonl │ 怀旧.jsonl │ 性欲.jsonl │ 恐惧.jsonl │ 悲伤.jsonl │ 敬畏.jsonl │ 有趣.jsonl │ 欣赏.jsonl │ 浪漫.jsonl │ 渴望.jsonl │ 满意.jsonl │ 满足.jsonl │ 焦虑.jsonl │ 痛恨.jsonl │ 痛苦.jsonl │ 着迷.jsonl │ 钦佩.jsonl │ ├───家人 │ 兴奋.jsonl │ 冷静.jsonl │ 厌倦.jsonl │ 厌恶.jsonl │ 同情.jsonl │ 困惑.jsonl │ 娱乐.jsonl │ 嫉妒.jsonl
This commit is contained in:
parent
0553c3877b
commit
a38ef60058
1
.gitignore
vendored
1
.gitignore
vendored
@ -5,6 +5,7 @@ zhipuai/
|
||||
data/
|
||||
|
||||
*.jsonl
|
||||
*.json
|
||||
# ./generate_data/*.jsonl
|
||||
# ./generate_data/*/*/*.jsonl
|
||||
|
||||
|
60
generate_data/final_data/merge_jsonl.py
Normal file
60
generate_data/final_data/merge_jsonl.py
Normal file
@ -0,0 +1,60 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
def save_merge_json(data_lis, file_path):
    """Serialize ``data_lis`` to JSON and write it to ``file_path`` as UTF-8.

    Non-ASCII characters are written as-is rather than escaped. Items are
    separated by a comma followed by a newline, and keys are separated from
    their values by a bare colon.
    """
    with open(file_path, mode='wt', encoding='utf-8') as out_file:
        payload = json.dumps(
            data_lis,
            ensure_ascii=False,
            separators=(',\n', ':'),
        )
        out_file.write(payload)
|
||||
|
||||
|
||||
def get_all_file_paths(folder_path, file_type='.jsonl'):
    """Return the paths of the files directly inside ``folder_path`` whose name ends with ``file_type``.

    Args:
        folder_path: Directory to scan; subdirectories are not descended into.
        file_type: Required file-name suffix, e.g. ``'.jsonl'``.

    Returns:
        A list of ``os.path.join(folder_path, name)`` paths, sorted by file name.

    Raises:
        ValueError: If ``folder_path`` is not an existing directory.
    """
    # Make sure the argument really is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # merge order reproducible from run to run.
    # Match on the suffix: a substring test would also accept names such as
    # 'data.jsonl.bak'.
    candidates = (
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.endswith(file_type)
    )
    return [path for path in candidates if os.path.isfile(path)]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Merge every .jsonl file found directly inside folder_path into one
    # JSON array file.
    conversion_lis = []

    folder_path = r'./'

    # The output name is built from the last two '/'-separated components
    # of folder_path. A missing parent, or a parent of '.', counts as
    # "no parent".
    # NOTE: only these two components are used, so for a deeper path the
    # output location drops the leading components.
    path_parts = folder_path.split('/')
    merge_path = path_parts[-1]
    if len(path_parts) >= 2 and path_parts[-2] != '.':
        merge_last_path = path_parts[-2]
    else:
        merge_last_path = ''
    print(f'merge_path={merge_path},merge_last_path={merge_last_path}')

    for path in get_all_file_paths(folder_path):
        print(path)

        with open(path, 'rt', encoding='utf-8') as file:
            for line in file:
                # Drop the trailing newline.
                line = line.rstrip('\n')
                # A blank line carries no record; skip it instead of
                # reporting it as a decode error.
                if not line.strip():
                    continue
                # Parse one JSON record per line; report and skip bad lines.
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")
                else:
                    conversion_lis.append(data)

    if merge_last_path != '':
        save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
    elif merge_path != '':
        save_merge_json_path = rf'./{merge_path}_merge.json'
    else:
        save_merge_json_path = './curr_merge.json'

    save_merge_json(data_lis=conversion_lis,
                    file_path=save_merge_json_path)
    print(len(conversion_lis), save_merge_json_path)
|
75
generate_data/final_data/merge_jsonl_r.py
Normal file
75
generate_data/final_data/merge_jsonl_r.py
Normal file
@ -0,0 +1,75 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
def save_merge_json(data_lis, file_path):
    """Write ``data_lis`` to ``file_path`` as UTF-8 encoded JSON.

    Non-ASCII characters are kept unescaped. A comma plus newline separates
    items, and a bare colon separates each key from its value.
    """
    with open(file_path, mode='wt', encoding='utf-8') as target:
        serialized = json.dumps(
            data_lis,
            separators=(',\n', ':'),
            ensure_ascii=False,
        )
        target.write(serialized)
|
||||
|
||||
|
||||
def get_all_file_paths(folder_path, file_type='.jsonl'):
    """Return the paths of the files directly inside ``folder_path`` whose name ends with ``file_type``.

    Args:
        folder_path: Directory to scan; subdirectories are not descended into.
        file_type: Required file-name suffix, e.g. ``'.jsonl'``.

    Returns:
        A list of ``os.path.join(folder_path, name)`` paths, sorted by file name.

    Raises:
        ValueError: If ``folder_path`` is not an existing directory.
    """
    # Make sure the argument really is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    file_paths = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # merge order reproducible from run to run.
    for name in sorted(os.listdir(folder_path)):
        # Match on the suffix: a substring test would also accept names
        # such as 'data.jsonl.bak'.
        if not name.endswith(file_type):
            continue
        full_path = os.path.join(folder_path, name)
        if os.path.isfile(full_path):
            file_paths.append(full_path)
    return file_paths
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # For each subfolder of ./<data_ai>/, merge its .jsonl files into
    # <subfolder>_merge.json, then write all records from all subfolders to
    # <data_ai>_final_merge.json.

    data_ai = 'qwen'  # python merge_jsonl_r.py > qwen.txt
    # data_ai = 'zhipuai' # python merge_jsonl_r.py > zhipuai.txt
    root_dir = rf'./{data_ai}/'

    save_final_merge_json_path = f'{data_ai}_final_merge.json'

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # order of the final merge reproducible.
    subfolders = [
        os.path.join(root_dir, d)
        for d in sorted(os.listdir(root_dir))
        if os.path.isdir(os.path.join(root_dir, d))
    ]

    final_list = []
    for folder_path in subfolders:
        conversion_lis = []

        # The output name is built from the last two '/'-separated
        # components of folder_path. A missing parent, or a parent of '.',
        # counts as "no parent".
        path_parts = folder_path.split('/')
        merge_path = path_parts[-1]
        if len(path_parts) >= 2 and path_parts[-2] != '.':
            merge_last_path = path_parts[-2]
        else:
            merge_last_path = ''
        print(f'merge_path={merge_path},merge_last_path={merge_last_path}')

        for path in get_all_file_paths(folder_path):
            print(path)

            with open(path, 'rt', encoding='utf-8') as file:
                for line in file:
                    # Drop the trailing newline.
                    line = line.rstrip('\n')
                    # A blank line carries no record; skip it instead of
                    # reporting it as a decode error.
                    if not line.strip():
                        continue
                    # Parse one JSON record per line; report and skip bad lines.
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON: {e}")
                    else:
                        conversion_lis.append(data)

        if merge_last_path != '':
            save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
        elif merge_path != '':
            save_merge_json_path = rf'./{merge_path}_merge.json'
        else:
            save_merge_json_path = './curr_merge.json'

        save_merge_json(data_lis=conversion_lis,
                        file_path=save_merge_json_path)

        # Extend in place: `final_list + conversion_lis` would copy the
        # whole accumulated list on every iteration.
        final_list.extend(conversion_lis)
        print(len(conversion_lis), len(final_list), save_merge_json_path)

    save_merge_json(data_lis=final_list, file_path=save_final_merge_json_path)
    print(save_final_merge_json_path)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user