# -*- coding: utf-8 -*-
"""Generate a synthetic instruction/answer dataset about planting-area sensors.

Each record pairs a randomly phrased Chinese question about one sensor
reading in one planting area with a templated answer containing a random
value drawn from that reading's configured range.
"""
import json
import os
import random

# Values written (as strings) into the "invokeType" field of each record.
_INVOKE_TYPES = [1, 2, 3]

# Planting-area codes: the uppercase letters 'A' through 'Z'.
_AREA_CODES = [chr(i) for i in range(ord('A'), ord('Z') + 1)]

# Sensor readings that can be asked about, with the unit appended to the
# answer and the inclusive bounds used when drawing a random value.
_PARAMETERS = [
    {"name": "土壤湿度", "unit": "%", "min": 10, "max": 100},
    {"name": "土壤温度", "unit": "℃", "min": 5, "max": 40},
    {"name": "空气温度", "unit": "℃", "min": -10, "max": 45},
    {"name": "电导率", "unit": "mS/cm", "min": 0.1, "max": 5.0},
]

# Varied phrasings of the same question so the data reads more naturally.
# These are str.format templates ({area}, {name}) built once, instead of
# f-strings rebuilt on every loop iteration.
_INSTRUCTION_TEMPLATES = [
    "现在{area}种植区内{name}如何?",
    "请告诉我{area}区的{name}情况。",
    "{area}区当前的{name}是多少?",
    "我想知道{area}区的{name}。",
    "{area}区的{name}现在是多少?",
    "{area}种植区目前的{name}是多少?",
    "能提供{area}区的{name}数据吗?",
    "{area}种植区的{name}是多少?",
    "请查询{area}区的{name}。",
    "{area}区现在的{name}数据是多少?",
    "帮我看看{area}区{name}的情况。",
    "{area}区的{name}值是多少?",
    "帮我查一下{area}区的{name}。",
    "{area}区的{name}现在什么情况?",
    "请帮我查一下{area}种植区的{name}是多少?",
    "我需要知道{area}区的{name}数据。",
    "请问{area}区的{name}如何?",
    "帮我查询{area}区的{name}情况。",
    "现在{area}区的{name}值是多少?",
]


def generate_dataset(num_samples=5000, seed=None):
    """Build a list of synthetic question/answer records.

    Args:
        num_samples: Number of records to generate. Zero or a negative
            number yields an empty list.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the result is reproducible and the global random
            state is left untouched. When ``None`` (the default), the
            module-level ``random`` generator is used.

    Returns:
        A list of dicts, each with the string keys ``"instruction"``,
        ``"invokeType"``, ``"areaCode"`` and ``"output"``.
    """
    # Both the random module and random.Random expose choice()/uniform().
    rng = random if seed is None else random.Random(seed)

    dataset = []
    for _ in range(num_samples):
        # Draw order is kept fixed so a seeded run is stable.
        invoke_type = rng.choice(_INVOKE_TYPES)
        area_code = rng.choice(_AREA_CODES)
        parameter = rng.choice(_PARAMETERS)

        # One decimal place regardless of whether the bounds are int or float.
        value = round(rng.uniform(parameter["min"], parameter["max"]), 1)

        template = rng.choice(_INSTRUCTION_TEMPLATES)
        instruction = template.format(area=area_code, name=parameter["name"])
        output = f"{area_code}区现在{parameter['name']}{value}{parameter['unit']}"

        dataset.append({
            "instruction": instruction,
            "invokeType": str(invoke_type),
            "areaCode": area_code,
            "output": output,
        })
    return dataset


# Generate the data and save it as a JSON file.
if __name__ == '__main__':
    dataset = generate_dataset()
    output_file = 'output/synthetic_dataset.json'

    # open(..., 'w') does not create missing parent directories.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
    print(f"已生成 {output_file} 文件,包含{len(dataset)}条数据。")