# OliveSensorAPI/IOTLLM/generate_data/EC_process/Sensor_QA.py
#
# 71 lines
# 3.1 KiB
# Python

# -*- coding: utf-8 -*-
import json
import random
# 定义生成5000条数据集的函数
def generate_dataset(num_samples=5000):
dataset = []
invoke_types = [1, 2, 3]
area_codes = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
parameters = [
{"name": "土壤湿度", "unit": "%", "min": 10, "max": 100},
{"name": "土壤温度", "unit": "", "min": 5, "max": 40},
{"name": "空气温度", "unit": "", "min": -10, "max": 45},
{"name": "电导率", "unit": "mS/cm", "min": 0.1, "max": 5.0}
]
for _ in range(num_samples):
invoke_type = random.choice(invoke_types)
area_code = random.choice(area_codes)
parameter = random.choice(parameters)
if isinstance(parameter["min"], int):
value = round(random.uniform(parameter["min"], parameter["max"]), 1)
else:
value = round(random.uniform(parameter["min"], parameter["max"]), 1)
# 增加多变的提问方式,使数据更自然化
instruction_templates = [
f"现在{area_code}种植区内{parameter['name']}如何?",
f"请告诉我{area_code}区的{parameter['name']}情况。",
f"{area_code}区当前的{parameter['name']}是多少?",
f"我想知道{area_code}区的{parameter['name']}",
f"{area_code}区的{parameter['name']}现在是多少?",
f"{area_code}种植区目前的{parameter['name']}是多少?",
f"能提供{area_code}区的{parameter['name']}数据吗?",
f"{area_code}种植区的{parameter['name']}是多少?",
f"请查询{area_code}区的{parameter['name']}",
f"{area_code}区现在的{parameter['name']}数据是多少?",
f"帮我看看{area_code}{parameter['name']}的情况。",
f"{area_code}区的{parameter['name']}值是多少?",
f"帮我查一下{area_code}区的{parameter['name']}",
f"{area_code}区的{parameter['name']}现在什么情况?",
f"请帮我查一下{area_code}种植区的{parameter['name']}是多少?",
f"我需要知道{area_code}区的{parameter['name']}数据。",
f"请问{area_code}区的{parameter['name']}如何?",
f"帮我查询{area_code}区的{parameter['name']}情况。",
f"现在{area_code}区的{parameter['name']}值是多少?"
]
instruction = random.choice(instruction_templates)
output = f"{area_code}区现在{parameter['name']}{value}{parameter['unit']}"
data = {
"instruction": instruction,
"invokeType": str(invoke_type),
"areaCode": area_code,
"output": output
}
dataset.append(data)
return dataset
# Generate the dataset and save it as a JSON file.
if __name__ == '__main__':
    import os

    dataset = generate_dataset()
    output_file = 'output/synthetic_dataset.json'
    # open(..., 'w') does not create missing parent directories, so make sure
    # the target folder exists before writing.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    # ensure_ascii=False keeps the Chinese text readable in the written file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
    print(f"已生成 {output_file} 文件,包含{len(dataset)}条数据。")