import json
import os
from hashlib import md5

from datasketch import MinHash
from loguru import logger


def is_json_file(filename):
    return filename.endswith('.json')


# Exact match: two strings are duplicates if their MD5 digests are equal.
def is_duplicate_absolutely(d1, d2):
    return md5(d1.encode('utf-8')).hexdigest() == md5(d2.encode('utf-8')).hexdigest()


# Compute a MinHash signature for a dict.
def hash_dict(dict_obj):
    m = MinHash()
    # Iterate values in key order; non-str values are converted to str first.
    for _key, value in sorted(dict_obj.items()):
        m.update(str(value).encode('utf-8'))
    return m


# Deduplicate a list of dicts using exact matching plus MinHash similarity.
def deduplicate_json(data_list, threshold=0.8):
    seen_hashes = []   # (MinHash, item) pairs for the items kept so far
    unique_items = []  # items that survive deduplication
    for item in data_list:
        min_hash = hash_dict(item)
        # Exact-match check against items already kept.
        if any(is_duplicate_absolutely(str(item), str(existing)) for existing in unique_items):
            continue
        # MinHash similarity check: skip the item if its estimated Jaccard
        # similarity to any kept item exceeds the threshold.
        has_similar = any(stored_min_hash.jaccard(min_hash) > threshold
                          for stored_min_hash, _stored_item in seen_hashes)
        if not has_similar:
            seen_hashes.append((min_hash, item))
            unique_items.append(item)
    return unique_items


if __name__ == '__main__':
    data_ai = 'qwen'
    root_dir = f'./{data_ai}/'
    if not os.path.exists(root_dir):
        logger.error(f"folder {root_dir} does not exist")
    else:
        # Create the output directory only after confirming the input exists.
        dedup_output_dir = os.path.join(root_dir, 'dedup')
        os.makedirs(dedup_output_dir, exist_ok=True)
        for file in os.listdir(root_dir):
            file_path = os.path.join(root_dir, file)
            if os.path.isfile(file_path):
                print(f'file name: {file_path}')
                if is_json_file(file_path):
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    dedup_data = deduplicate_json(data)
                    with open(os.path.join(dedup_output_dir, 'dedup_' + file), 'w', encoding='utf-8') as output_file:
                        json.dump(dedup_data, output_file, ensure_ascii=False, indent=4)
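
# A minimal usage sketch of deduplicate_json on an in-memory list (the sample
# records below are hypothetical, not from the source data). The second record
# is an exact copy of the first and is dropped by the MD5 check; the third
# shares no value strings with the first, so its estimated MinHash Jaccard
# similarity stays below the 0.8 threshold and it is kept.
#
#   sample = [
#       {'instruction': 'What is 2+2?', 'output': '4'},
#       {'instruction': 'What is 2+2?', 'output': '4'},
#       {'instruction': 'Explain addition', 'output': 'Adding combines numbers.'},
#   ]
#   deduplicate_json(sample)  # -> keeps records 1 and 3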