"""Merge the ``.txt`` files of ``output_txt`` into one cleaned text file.

The files are read in natural sort order, concatenated, stripped of newline
characters, and runs of three or more identical punctuation characters are
collapsed to a single one. The result is written to ``cleaned_data.txt``.
"""
import os
import re

import natsort

folder_path = "output_txt"

# Read the files in natural sort order. The pieces are collected in a list and
# joined once at the end, which avoids repeatedly re-copying a growing string.
text_parts = []
for filename in natsort.natsorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text_parts.append(file.read())

# Concatenate everything and drop all newline characters in a single pass.
combined_text = "".join(text_parts).replace('\n', '')

# Collapse three or more consecutive identical punctuation characters
# (one character followed by at least two repeats of itself) into one.
combined_text = re.sub(r'([。,!?:;. ·])\1{2,}', r'\1', combined_text)

# Save the cleaned text to a new file.
with open("cleaned_data.txt", 'w', encoding='utf-8') as file:
    file.write(combined_text)

print("数据处理完成")