From 77ff2d079ccbe136650f02f1e902ffb811c0eb9e Mon Sep 17 00:00:00 2001
From: zealot52099
Date: Wed, 20 Mar 2024 23:08:36 +0800
Subject: [PATCH] update deduplicate.py

---
 datasets/deduplicate.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/datasets/deduplicate.py b/datasets/deduplicate.py
index a020fd9..54137cb 100644
--- a/datasets/deduplicate.py
+++ b/datasets/deduplicate.py
@@ -48,6 +48,8 @@ def deduplicate_json(data_list, threshold=0.8):
     keep = []
     duplicate = []
     for item in data_list:
+        if not item['conversation']:
+            continue
         # min_hash = hash_dict(item)
         sim_hash = hash_dict(item)
         # print(f'min_hash: {min_hash}')