update deduplicate.py
This commit is contained in:
parent
41744ed604
commit
77ff2d079c
@@ -48,6 +48,8 @@ def deduplicate_json(data_list, threshold=0.8):
|
|||||||
keep = []
|
keep = []
|
||||||
duplicate = []
|
duplicate = []
|
||||||
for item in data_list:
|
for item in data_list:
|
||||||
|
if not item['conversation']:
|
||||||
|
continue
|
||||||
# min_hash = hash_dict(item)
|
# min_hash = hash_dict(item)
|
||||||
sim_hash = hash_dict(item)
|
sim_hash = hash_dict(item)
|
||||||
# print(f'min_hash: {min_hash}')
|
# print(f'min_hash: {min_hash}')
|
||||||
|
Loading…
Reference in New Issue
Block a user