Merge pull request #112 from zealot52099/dev

update deduplicate.py
This commit is contained in:
xzw 2024-03-20 23:20:14 +08:00 committed by GitHub
commit 817c25b349
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -48,6 +48,8 @@ def deduplicate_json(data_list, threshold=0.8):
keep = [] keep = []
duplicate = [] duplicate = []
for item in data_list: for item in data_list:
if not item['conversation']:
continue
# min_hash = hash_dict(item) # min_hash = hash_dict(item)
sim_hash = hash_dict(item) sim_hash = hash_dict(item)
# print(f'min_hash: {min_hash}') # print(f'min_hash: {min_hash}')