OliveSensorAPI/IOTLLM/generate_data/EC_process/processPDF/mergeTXT.py

26 lines
731 B
Python

import os
import re
import natsort
# Merge every ``.txt`` file found directly inside ``folder_path`` into a
# single cleaned text file named ``cleaned_data.txt``.
folder_path = "output_txt"

# Read the files in natural-sort order so that numbered names are taken in
# human order (e.g. "2.txt" before "10.txt") rather than plain lexical order.
chunks = []
for filename in natsort.natsorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        chunks.append(file.read())

# Join once instead of growing a string with ``+=`` for every file, then drop
# all newlines so the merged text becomes one continuous line.
combined_text = "".join(chunks).replace('\n', '')

# Collapse a run of three or more identical characters from the class below
# (punctuation marks, plus the space it contains) into a single occurrence.
# Runs of exactly two are left untouched.
combined_text = re.sub(r'([。,!?:;. ·])\1{2,}', r'\1', combined_text)

# Save the cleaned text to a new file.
with open("cleaned_data.txt", 'w', encoding='utf-8') as file:
    file.write(combined_text)
print("数据处理完成")