Merge pull request #89 from chg0901/main

optimizing the merge_json.py + moving files + Update zhipuai_gen_data.py (add exception handling for the glm-4 response retrieval)

commit 12eeaf8cc1
.gitignore (vendored): 6 additions

@@ -6,6 +6,11 @@ data/
 pdf/
 .idea/
 
+*.jsonl
+*.json
+# ./generate_data/*.josnl
+# ./generate_data/*/*/*.josnl
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -169,3 +174,4 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
generate_data/final_data/merge_jsonl.py (new file, 60 lines)

import json
import os


def save_merge_json(data_lis, file_path):
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))


def get_all_file_paths(folder_path, file_type='.jsonl'):
    # Make sure the argument is a directory
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    # Collect the paths of all matching files in the folder
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(
        folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
    return file_paths


if __name__ == '__main__':
    conversion_lis = []

    folder_path = r'./'

    merge_path = folder_path.split('/')[-1]
    try:
        merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2] != '.' else ''
    except:
        merge_last_path = ''
    print(f'merge_path={merge_path},merge_last_path={merge_last_path}')

    for path in get_all_file_paths(folder_path):
        print(path)

        with open(path, 'rt', encoding='utf-8') as file:
            for line in file:
                # # Strip the trailing newline
                # if line == '\n':
                #     line = line.rstrip('\n')
                line = line.rstrip('\n')
                # Parse the JSON record
                try:
                    data = json.loads(line)
                    conversion_lis.append(data)
                    # conversion_lis.append('\n')
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")

    if merge_last_path != '':
        save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
    elif merge_path != '':
        save_merge_json_path = rf'./{merge_path}_merge.json'
    else:
        save_merge_json_path = rf'./curr_merge.json'

    save_merge_json(data_lis=conversion_lis,
                    file_path=save_merge_json_path)
    print(len(conversion_lis), save_merge_json_path)
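For reference, a minimal sketch of reading the merged output back in. Because save_merge_json writes with separators=(',\n', ':'), the result is a single JSON array with roughly one record per line, so a plain json.load recovers the full list. With the default folder_path = r'./' the script falls through to ./curr_merge.json; any other output name depends on the folder it is run from.

import json

# Read back the merged file produced by merge_jsonl.py (this path assumes the
# default folder_path of './', which falls through to curr_merge.json).
with open('./curr_merge.json', 'rt', encoding='utf-8') as f:
    merged = json.load(f)

# merged is a list of the individual .jsonl records that were concatenated.
print(type(merged), len(merged))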
generate_data/final_data/merge_jsonl_r.py (new file, 75 lines)

import json
import os


def save_merge_json(data_lis, file_path):
    with open(file_path, 'wt', encoding='utf-8') as file:
        json.dump(data_lis, file, ensure_ascii=False, separators=(',\n', ':'))


def get_all_file_paths(folder_path, file_type='.jsonl'):
    # Make sure the argument is a directory
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    # Collect the paths of all matching files in the folder
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(
        folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)]
    return file_paths


if __name__ == '__main__':

    data_ai = 'qwen'  # python merge_jsonl_r.py > qwen.txt
    # data_ai = 'zhipuai'  # python merge_jsonl_r.py > zhipuai.txt
    root_dir = rf'./{data_ai}/'

    save_final_merge_json_path = f'{data_ai}_final_merge.json'

    subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]

    final_list = []
    for folder_path in subfolders:
        conversion_lis = []
        merge_path = folder_path.split('/')[-1]
        try:
            merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2] != '.' else ''
        except:
            merge_last_path = ''
        print(f'merge_path={merge_path},merge_last_path={merge_last_path}')

        for path in get_all_file_paths(folder_path):
            print(path)

            with open(path, 'rt', encoding='utf-8') as file:
                for line in file:
                    # # Strip the trailing newline
                    # if line == '\n':
                    #     line = line.rstrip('\n')
                    line = line.rstrip('\n')
                    # Parse the JSON record
                    try:
                        data = json.loads(line)
                        conversion_lis.append(data)
                        # conversion_lis.append('\n')
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON: {e}")

        if merge_last_path != '':
            save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
        elif merge_path != '':
            save_merge_json_path = rf'./{merge_path}_merge.json'
        else:
            save_merge_json_path = rf'./curr_merge.json'

        save_merge_json(data_lis=conversion_lis,
                        file_path=save_merge_json_path)

        final_list = final_list + conversion_lis
        print(len(conversion_lis), len(final_list), save_merge_json_path)

    save_merge_json(data_lis=final_list, file_path=save_final_merge_json_path)
    print(save_final_merge_json_path)
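As a usage sketch (the sub-folder and file names below are assumed for illustration, not part of the commit): merge_jsonl_r.py walks one level of sub-folders under ./<data_ai>/, writes a per-folder <folder>_merge.json next to the sub-folders, and concatenates everything into <data_ai>_final_merge.json in the current directory.

./qwen/                      # root_dir for data_ai = 'qwen'
    session_batch_a/         # hypothetical sub-folder
        part1.jsonl
        part2.jsonl
    session_batch_b/         # hypothetical sub-folder
        part3.jsonl

python merge_jsonl_r.py > qwen.txt   # the redirect captures the per-file progress prints

# Expected outputs for this layout:
#   ./qwen/session_batch_a_merge.json
#   ./qwen/session_batch_b_merge.json
#   ./qwen_final_merge.json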
zhipuai_gen_data.py (updated): wrap the glm-4 request in a try/except

@@ -34,11 +34,21 @@ def zhipu_api(data, emo):
 
     top_p = round(random.uniform(0.1, 0.9), 2)
     messages = getText('user', prompt)
-    response = client.chat.completions.create(
-        model='glm-4',
-        messages=messages,
-        top_p=top_p,
-    )
+
+    # Error code: 400, with error text {"error":{"code":"1301","message":
+    # "The system detected that the input or generated content may contain unsafe or sensitive content. Please avoid prompts that are likely to produce sensitive content. Thank you for your cooperation."}}
+    try:
+        response = client.chat.completions.create(
+            model='glm-4',
+            messages=messages,
+            top_p=top_p,
+        )
+    except:
+        response = client.chat.completions.create(
+            model='glm-4',
+            messages=messages,
+            top_p=top_p,
+        )
 
     return response.choices[0].message.content
 
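The committed handler retries the glm-4 call exactly once; the bare except hides the first error, and if the second call also trips the content filter the exception still propagates to the caller. A bounded retry loop is one possible alternative. The sketch below is not part of this commit: the helper name is hypothetical, and it assumes the module-level client that zhipuai_gen_data.py already creates.

import time

def create_with_retry(messages, top_p, max_retries=3, delay=2.0):
    # Sketch only: retry the glm-4 request a few times with a short pause,
    # then re-raise the last error (e.g. the code 1301 content-filter error
    # documented in the comment above) so failures stay visible.
    last_err = None
    for _ in range(max_retries):
        try:
            response = client.chat.completions.create(  # `client` defined in zhipuai_gen_data.py
                model='glm-4',
                messages=messages,
                top_p=top_p,
            )
            return response.choices[0].message.content
        except Exception as e:
            last_err = e
            time.sleep(delay)
    raise last_err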