71 lines
3.1 KiB
Python
71 lines
3.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
import json
import os
import random
|
|
|
|
# Build a synthetic dataset of sensor-query records (5000 samples by default)
|
|
def generate_dataset(num_samples=5000, seed=None):
    """Build a list of synthetic sensor-query records.

    Every record pairs a randomly phrased question about one measured
    parameter in one area (letters ``A``-``Z``) with a templated answer
    that carries a random reading rounded to one decimal place.

    Args:
        num_samples: Number of records to generate. Zero or a negative
            value produces an empty list.
        seed: Optional seed for reproducible output. When given, a private
            ``random.Random(seed)`` is used and the global random state is
            left untouched. When ``None`` (the default) the module-level
            generator is used.

    Returns:
        A list of dicts with the keys ``"instruction"``, ``"invokeType"``
        (a string, one of ``"1"``, ``"2"``, ``"3"``), ``"areaCode"`` and
        ``"output"``.
    """
    # The ``random`` module exposes the same choice/uniform API as a
    # ``random.Random`` instance, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)

    invoke_types = [1, 2, 3]
    area_codes = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
    parameters = [
        {"name": "土壤湿度", "unit": "%", "min": 10, "max": 100},
        {"name": "土壤温度", "unit": "℃", "min": 5, "max": 40},
        {"name": "空气温度", "unit": "℃", "min": -10, "max": 45},
        {"name": "电导率", "unit": "mS/cm", "min": 0.1, "max": 5.0},
    ]

    # Varied phrasings make the questions look more natural. The templates
    # are loop-invariant, so they are defined once and filled in per record.
    instruction_templates = (
        "现在{area}种植区内{name}如何?",
        "请告诉我{area}区的{name}情况。",
        "{area}区当前的{name}是多少?",
        "我想知道{area}区的{name}。",
        "{area}区的{name}现在是多少?",
        "{area}种植区目前的{name}是多少?",
        "能提供{area}区的{name}数据吗?",
        "{area}种植区的{name}是多少?",
        "请查询{area}区的{name}。",
        "{area}区现在的{name}数据是多少?",
        "帮我看看{area}区{name}的情况。",
        "{area}区的{name}值是多少?",
        "帮我查一下{area}区的{name}。",
        "{area}区的{name}现在什么情况?",
        "请帮我查一下{area}种植区的{name}是多少?",
        "我需要知道{area}区的{name}数据。",
        "请问{area}区的{name}如何?",
        "帮我查询{area}区的{name}情况。",
        "现在{area}区的{name}值是多少?",
    )

    dataset = []
    for _ in range(num_samples):
        # The order of the random draws is kept stable so that results under
        # a globally seeded ``random`` do not change.
        invoke_type = rng.choice(invoke_types)
        area_code = rng.choice(area_codes)
        parameter = rng.choice(parameters)

        # Integer- and float-bounded parameters are treated the same way:
        # a uniform draw rounded to one decimal place.
        value = round(rng.uniform(parameter["min"], parameter["max"]), 1)

        template = rng.choice(instruction_templates)
        instruction = template.format(area=area_code, name=parameter["name"])
        output = f"{area_code}区现在{parameter['name']}{value}{parameter['unit']}"

        dataset.append({
            "instruction": instruction,
            "invokeType": str(invoke_type),
            "areaCode": area_code,
            "output": output,
        })

    return dataset
|
|
|
|
# Generate the data and save it as a JSON file
|
|
if __name__ == '__main__':
    # Script entry point: build the default-size dataset and write it to disk.
    dataset = generate_dataset()
    output_file = 'output/synthetic_dataset.json'

    # open(..., 'w') does not create missing parent directories, so make sure
    # the target folder exists before writing.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # ensure_ascii=False keeps the Chinese text readable in the file instead
    # of escaping it to \uXXXX sequences.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

    print(f"已生成 {output_file} 文件,包含{len(dataset)}条数据。")
|