add deduplicate.py
commit 861f12d47a (parent c7d916bf4f)

datasets/deduplicate.py: 68 lines added (new file)
@@ -0,0 +1,68 @@
import json
import os
from hashlib import md5

from datasketch import MinHash
from loguru import logger


def is_json_file(filename):
    return filename.endswith('.json')


# Exact match: compare MD5 digests of the serialized records.
def is_duplicate_absolutely(d1, d2):
    return md5(d1.encode('utf-8')).hexdigest() == md5(d2.encode('utf-8')).hexdigest()


# Compute a MinHash signature for a dict.
def hash_dict(dict_obj):
    m = MinHash()
    for key, value in sorted(dict_obj.items()):
        # Non-str values must be converted to str before hashing.
        m.update(str(value).encode('utf-8'))
    return m


# Deduplicate a list of dicts using exact matching plus MinHash similarity.
def deduplicate_json(data_list, threshold=0.8):
    seen_hashes = []
    deduplicated = []

    for item in data_list:
        min_hash = hash_dict(item)

        # Exact-match deduplication: skip records identical to one already kept.
        if any(is_duplicate_absolutely(str(item), str(existing)) for existing in deduplicated):
            continue

        # MinHash similarity deduplication: skip records whose estimated
        # Jaccard similarity to a kept record exceeds the threshold.
        has_similar = any(
            stored_min_hash.jaccard(min_hash) > threshold
            for stored_min_hash, _stored_item in seen_hashes
        )
        if not has_similar:
            seen_hashes.append((min_hash, item))
            deduplicated.append(item)

    return deduplicated


if __name__ == '__main__':
    data_ai = 'qwen'
    root_dir = f'./{data_ai}/'
    dedup_output_dir = os.path.join(root_dir, 'dedup')

    if not os.path.exists(root_dir):
        logger.error(f"folder {root_dir} does not exist")
    else:
        os.makedirs(dedup_output_dir, exist_ok=True)
        for file in os.listdir(root_dir):
            file_path = os.path.join(root_dir, file)
            if os.path.isfile(file_path):
                print(f'file name: {file_path}')
                if is_json_file(file_path):
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    dedup_data = deduplicate_json(data)
                    with open(os.path.join(dedup_output_dir, 'dedup_' + file), 'w', encoding='utf-8') as output_file:
                        json.dump(dedup_data, output_file, ensure_ascii=False, indent=4)
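For reference, a minimal usage sketch of deduplicate_json on an in-memory list. The sample records and the import path are hypothetical; it assumes datasets/deduplicate.py is importable and that datasketch and loguru are installed. Because hash_dict feeds whole field values into the MinHash, a record must share most of its values with a kept record to exceed the 0.8 threshold, so only the exact duplicate is reliably removed here.

# Hypothetical usage sketch; assumes datasets/deduplicate.py is on sys.path.
from deduplicate import deduplicate_json

records = [
    {"instruction": "Translate 'hello' to French.", "output": "bonjour"},
    # Exact duplicate of the first record: removed by the MD5 check.
    {"instruction": "Translate 'hello' to French.", "output": "bonjour"},
    # Near-duplicate: only one of its two field values matches, so the
    # estimated Jaccard is about 1/3 and stays below threshold=0.8.
    {"instruction": "Translate 'hello' into French.", "output": "bonjour"},
    {"instruction": "Summarize this paragraph.", "output": "A short summary."},
]

kept = deduplicate_json(records, threshold=0.8)
print(len(kept))  # expected: 3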