update deduplicate.py

This commit is contained in:
zealot52099 2024-03-20 23:08:36 +08:00
parent 41744ed604
commit 77ff2d079c

View File

@@ -48,6 +48,8 @@ def deduplicate_json(data_list, threshold=0.8):
keep = [] keep = []
duplicate = [] duplicate = []
for item in data_list: for item in data_list:
if not item['conversation']:
continue
# min_hash = hash_dict(item) # min_hash = hash_dict(item)
sim_hash = hash_dict(item) sim_hash = hash_dict(item)
# print(f'min_hash: {min_hash}') # print(f'min_hash: {min_hash}')