# OliveSensorAPI/IOTLLM/generate_data/EC_process/Sensor_QA.py

# -*- coding: utf-8 -*-
import json
import random

# Question templates used for the generated instructions. "{area}" is replaced
# by the planting-area code and "{name}" by the sensor parameter name. They are
# defined once here instead of being rebuilt for every generated sample.
_INSTRUCTION_TEMPLATES = (
    "现在{area}种植区内{name}如何？",
    "请告诉我{area}区的{name}情况。",
    "{area}区当前的{name}是多少？",
    "我想知道{area}区的{name}。",
    "{area}区的{name}现在是多少？",
    "{area}种植区目前的{name}是多少？",
    "能提供{area}区的{name}数据吗？",
    "{area}种植区的{name}是多少？",
    "请查询{area}区的{name}。",
    "{area}区现在的{name}数据是多少？",
    "帮我看看{area}区{name}的情况。",
    "{area}区的{name}值是多少？",
    "帮我查一下{area}区的{name}。",
    "{area}区的{name}现在什么情况？",
    "请帮我查一下{area}种植区的{name}是多少？",
    "我需要知道{area}区的{name}数据。",
    "请问{area}区的{name}如何？",
    "帮我查询{area}区的{name}情况。",
    "现在{area}区的{name}值是多少？",
)


# Build a synthetic sensor question/answer dataset (5000 samples by default).
def generate_dataset(num_samples=5000, seed=None):
    """Generate synthetic sensor Q&A samples.

    Each sample picks, independently and uniformly at random, an invoke type
    (1-3), an area code (``A``-``Z``), a sensor parameter and a question
    template, plus a reading drawn uniformly from the parameter's
    ``[min, max]`` range and rounded to one decimal place.

    Args:
        num_samples: Number of samples to generate. Zero or a negative value
            yields an empty list.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible and the global ``random`` state
            is left untouched. When ``None`` (default), the module-level
            ``random`` generator is used.

    Returns:
        A list of dicts with the string keys ``"instruction"``,
        ``"invokeType"`` (the invoke type as a string), ``"areaCode"`` and
        ``"output"``.
    """
    rng = random if seed is None else random.Random(seed)

    invoke_types = [1, 2, 3]
    area_codes = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
    parameters = [
        {"name": "土壤湿度", "unit": "%", "min": 10, "max": 100},
        {"name": "土壤温度", "unit": "℃", "min": 5, "max": 40},
        {"name": "空气温度", "unit": "℃", "min": -10, "max": 45},
        {"name": "电导率", "unit": "mS/cm", "min": 0.1, "max": 5.0},
    ]

    dataset = []
    for _ in range(num_samples):
        # The order of the random draws is kept stable (invoke type, area,
        # parameter, value, template) so a seeded run stays reproducible.
        invoke_type = rng.choice(invoke_types)
        area_code = rng.choice(area_codes)
        parameter = rng.choice(parameters)

        # Integer and float ranges are sampled the same way: a uniform draw
        # rounded to one decimal place.
        value = round(rng.uniform(parameter["min"], parameter["max"]), 1)

        # Vary the phrasing of the question so the data reads more naturally.
        template = rng.choice(_INSTRUCTION_TEMPLATES)
        instruction = template.format(area=area_code, name=parameter["name"])
        output = f"{area_code}区现在{parameter['name']}{value}{parameter['unit']}"

        dataset.append({
            "instruction": instruction,
            "invokeType": str(invoke_type),
            "areaCode": area_code,
            "output": output,
        })

    return dataset

# Generate the dataset and save it as a JSON file.
if __name__ == '__main__':
    import os  # local import: only the script entry point needs it

    dataset = generate_dataset()
    output_file = 'output/synthetic_dataset.json'

    # open(..., 'w') does not create missing parent directories, so make sure
    # the target directory exists before writing.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # ensure_ascii=False keeps the Chinese text readable in the output file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

    print(f"已生成 {output_file} 文件，包含{len(dataset)}条数据。")