Merge pull request #103 from zealot52099/main

add deduplicate.py
This commit is contained in:
xzw 2024-03-19 17:09:43 +08:00 committed by GitHub
commit 19724be6b0

datasets/deduplicate.py (new file, 68 lines)

@@ -0,0 +1,68 @@
import json
import os
from hashlib import md5

from datasketch import MinHash
from loguru import logger

def is_json_file(filename):
    return filename.endswith('.json')


# Exact match: two serialized records are duplicates if their MD5 digests are equal
def is_duplicate_absolutely(d1, d2):
    return md5(d1.encode('utf-8')).hexdigest() == md5(d2.encode('utf-8')).hexdigest()


# Compute a MinHash signature for a dict
def hash_dict(dict_obj):
    m = MinHash()
    for _, value in sorted(dict_obj.items()):
        # Non-str values must be converted to str before hashing
        m.update(str(value).encode('utf8'))
    return m
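
# Example: because each value is hashed as one set element, two dicts with the
# same values produce identical signatures regardless of key order:
#   hash_dict({'a': 1, 'b': 2}).jaccard(hash_dict({'b': 2, 'a': 1}))  # == 1.0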

# Deduplicate a list of dicts using exact matching plus MinHash similarity;
# the returned list holds the records that survive deduplication
def deduplicate_json(data_list, threshold=0.8):
    seen_hashes = []
    duplicates_removed = []
    for item in data_list:
        min_hash = hash_dict(item)
        # Exact-match deduplication via MD5 of the serialized record
        if not any(is_duplicate_absolutely(str(item), str(existing)) for existing in duplicates_removed):
            # Near-duplicate detection via MinHash similarity
            has_similar = False
            for stored_min_hash, _ in seen_hashes:
                if stored_min_hash.jaccard(min_hash) > threshold:
                    has_similar = True
                    break
            if not has_similar:
                seen_hashes.append((min_hash, item))
                duplicates_removed.append(item)
    return duplicates_removed
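
# Note: each value is hashed whole, so the MinHash stage catches records that
# share most of their exact field values, not records whose values are merely
# similar strings.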

if __name__ == '__main__':
    data_ai = 'qwen'
    root_dir = f'./{data_ai}/'
    dedup_output_dir = os.path.join(root_dir, 'dedup')
    if not os.path.exists(root_dir):
        logger.error(f"folder {root_dir} does not exist")
    else:
        # Create the output folder only once the input folder is known to exist
        os.makedirs(dedup_output_dir, exist_ok=True)
        for file in os.listdir(root_dir):
            file_path = os.path.join(root_dir, file)
            if os.path.isfile(file_path):
                logger.info(f'file name: {file_path}')
                if is_json_file(file_path):
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    dedup_data = deduplicate_json(data)
                    with open(os.path.join(dedup_output_dir, 'dedup_' + file), 'w', encoding='utf-8') as output_file:
                        json.dump(dedup_data, output_file, ensure_ascii=False, indent=4)
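
A minimal usage sketch (hypothetical sample records; assumes the script above is importable as deduplicate, e.g. when run from the datasets/ directory):

from deduplicate import deduplicate_json

records = [
    {'instruction': 'greet', 'output': 'hello'},
    {'instruction': 'greet', 'output': 'hello'},       # exact duplicate: removed by the MD5 check
    {'output': 'hello', 'instruction': 'greet'},       # same values, different order: removed by MinHash
    {'instruction': 'farewell', 'output': 'goodbye'},  # distinct values: kept
]
print(len(deduplicate_json(records)))  # -> 2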