自定义数据集处理脚本

This commit is contained in:
黄子寒 2024-12-10 23:37:45 +08:00
parent 1125b67f50
commit 1ff833393d
62 changed files with 416 additions and 13 deletions

View File

@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/12 16:37
# @Author : 黄子寒
# @Email : 1064071566@qq.com
# @File : script.py
# @Project : EmoLLM
import requests
def query_sensor_data(area_code: str, parameter_type: str, timeout: float = 10.0) -> dict:
    """
    Query the local SensorAPI service for sensor data of one area and data type.

    :param area_code: area code, e.g. "A" or "B"
    :param parameter_type: data type, e.g. "moisture", "temperature" or "conductivity"
    :param timeout: seconds to wait for the server before giving up
    :return: the decoded JSON body on success, otherwise ``{"error": <message>}``
    """
    url = f"http://127.0.0.1:8000/sensors/api/{area_code}/{parameter_type}/"
    try:
        # Without an explicit timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        return response.json()  # decode the JSON body
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        return {"error": str(e)}
# Example usage: a command-line entry point that LLM function calling can invoke.
if __name__ == "__main__":
    import sys

    # Two positional arguments are expected: <area_code> <parameter_type>.
    if len(sys.argv) >= 3:
        requested_area, requested_type = sys.argv[1], sys.argv[2]
        print(query_sensor_data(requested_area, requested_type))
    else:
        print("请提供区域代码和数据类型参数")

View File

View File

@ -0,0 +1,16 @@
"""
ASGI config for SensorAPI project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
# Use the project settings unless DJANGO_SETTINGS_MODULE is already set in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "SensorAPI.settings")
# Module-level ASGI callable, built once when this module is imported.
application = get_asgi_application()

View File

@ -0,0 +1,124 @@
"""
Django settings for SensorAPI project.
Generated by 'django-admin startproject' using Django 5.1.2.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.1/ref/settings/
"""
import os
from pathlib import Path


def _env_flag(name: str, default: bool) -> bool:
    """Read a boolean from the environment; an unset variable yields *default*."""
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}


# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# The committed value is only a development fallback; set DJANGO_SECRET_KEY
# to override it without editing this file.
SECRET_KEY = (
    os.environ.get("DJANGO_SECRET_KEY")
    or "django-insecure-0+#$1#@+&8$+y#f%0q!^kcz-+5&nkqhaluu*3mv8fa9t793u=z"
)

# SECURITY WARNING: don't run with debug turned on in production!
# Stays True unless DJANGO_DEBUG is set to something other than 1/true/yes/on.
DEBUG = _env_flag("DJANGO_DEBUG", True)

# Comma-separated host names from DJANGO_ALLOWED_HOSTS; empty when unset.
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get("DJANGO_ALLOWED_HOSTS", "").split(",")
    if host.strip()
]

# Application definition
INSTALLED_APPS = [
    "django.contrib.admin",
    "django.contrib.auth",
    "django.contrib.contenttypes",
    "django.contrib.sessions",
    "django.contrib.messages",
    "django.contrib.staticfiles",
    "sensors",
]

MIDDLEWARE = [
    "django.middleware.security.SecurityMiddleware",
    "django.contrib.sessions.middleware.SessionMiddleware",
    "django.middleware.common.CommonMiddleware",
    "django.middleware.csrf.CsrfViewMiddleware",
    "django.contrib.auth.middleware.AuthenticationMiddleware",
    "django.contrib.messages.middleware.MessageMiddleware",
    "django.middleware.clickjacking.XFrameOptionsMiddleware",
]

ROOT_URLCONF = "SensorAPI.urls"

TEMPLATES = [
    {
        "BACKEND": "django.template.backends.django.DjangoTemplates",
        "DIRS": [],
        "APP_DIRS": True,
        "OPTIONS": {
            "context_processors": [
                "django.template.context_processors.debug",
                "django.template.context_processors.request",
                "django.contrib.auth.context_processors.auth",
                "django.contrib.messages.context_processors.messages",
            ],
        },
    },
]

WSGI_APPLICATION = "SensorAPI.wsgi.application"

# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.sqlite3",
        "NAME": BASE_DIR / "db.sqlite3",
    }
}

# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
    {
        "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
    },
]

# Internationalization
# https://docs.djangoproject.com/en/5.1/topics/i18n/
LANGUAGE_CODE = "en-us"
TIME_ZONE = "UTC"
USE_I18N = True
USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/
STATIC_URL = "static/"

# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"

View File

@ -0,0 +1,25 @@
"""
URL configuration for SensorAPI project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/5.1/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path
from django.urls import include, path
# Every URL under "sensors/" is delegated to the sensors app's own URLconf.
# NOTE(review): `admin` is imported above but no admin route is registered here.
urlpatterns = [
    path('sensors/', include('sensors.urls')),
]

View File

@ -0,0 +1,16 @@
"""
WSGI config for SensorAPI project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
# Use the project settings unless DJANGO_SETTINGS_MODULE is already set in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "SensorAPI.settings")
# Module-level WSGI callable, built once when this module is imported.
application = get_wsgi_application()

View File

@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/11 19:25
# @Author : 黄子寒
# @Email : 1064071566@qq.com
# @File : __init__.py
# @Project : EmoLLM

View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
    """Select the project settings module and hand argv to Django's command runner."""
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "SensorAPI.settings")
    try:
        from django.core.management import execute_from_command_line as run_command
    except ImportError as import_failure:
        # Re-raise with a hint about the most common causes, keeping the original cause chained.
        hint = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(hint) from import_failure
    else:
        run_command(sys.argv)


if __name__ == "__main__":
    main()

View File

View File

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

View File

@ -0,0 +1,6 @@
from django.apps import AppConfig
class SensorsConfig(AppConfig):
    """Application configuration for the ``sensors`` app."""

    # Field type for automatically created primary keys in this app.
    default_auto_field = "django.db.models.BigAutoField"
    # Python path of the application this configuration belongs to.
    name = "sensors"

View File

@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/11 20:03
# @Author : 黄子寒
# @Email : 1064071566@qq.com
# @File : __init__.py
# @Project : EmoLLM

View File

@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/11 20:03
# @Author : 黄子寒
# @Email : 1064071566@qq.com
# @File : __init__.py
# @Project : EmoLLM

View File

@ -0,0 +1,32 @@
# sensors/management/commands/populate_data.py
import random
from django.core.management.base import BaseCommand
from sensors.models import Sensor, SensorData
from django.utils import timezone
class Command(BaseCommand):
    """Management command that wipes and regenerates random sample sensor data."""

    help = "Populate database with sample sensor data for areas A to Z"

    def handle(self, *args, **kwargs):
        """
        Replace every Sensor and SensorData row with freshly generated samples.

        Each area "A".."Z" gets 2 to 5 sensors, and each sensor gets one random
        reading per parameter type. The work runs in a single transaction.
        """
        # Imported here so the command does not depend on edits to the module's import block.
        from django.db import transaction

        areas = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
        parameter_types = ['moisture', 'temperature', 'conductivity']

        # One transaction: a failure part-way through must not leave the tables
        # emptied or half populated.
        with transaction.atomic():
            # Readings go first so the sensor delete has nothing left to cascade
            # into; this also clears readings whose sensor is NULL.
            SensorData.objects.all().delete()
            Sensor.objects.all().delete()

            for area in areas:
                num_sensors = random.randint(2, 5)  # 2 to 5 sensors per area
                for _ in range(num_sensors):
                    sensor = Sensor.objects.create(area_code=area)
                    # One reading per parameter type, inserted with a single query.
                    # NOTE(review): SensorData.timestamp is auto_now_add, so the value
                    # passed here is presumably replaced at insert time - confirm.
                    SensorData.objects.bulk_create([
                        SensorData(
                            sensor=sensor,
                            parameter_type=param,
                            value=round(random.uniform(0.1, 100.0), 2),
                            timestamp=timezone.now(),
                        )
                        for param in parameter_types
                    ])

        self.stdout.write(self.style.SUCCESS('Successfully populated sample data for areas A to Z'))

View File

@ -0,0 +1,32 @@
# Generated by Django 5.1.2 on 2024-11-11 12:10
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration of the sensors app: creates the ``SensorData`` model."""

    # Flags this as the app's initial migration.
    initial = True
    # Nothing has to be applied before this migration.
    dependencies = []
    # NOTE(review): the fields created here (area_code, moisture, temperature,
    # conductivity) differ from the SensorData model in sensors/models.py
    # (sensor, parameter_type, value). Presumably later migrations reconcile
    # the schema - confirm they are part of the commit.
    operations = [
        migrations.CreateModel(
            name="SensorData",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("area_code", models.CharField(max_length=10)),
                ("moisture", models.DecimalField(decimal_places=2, max_digits=5)),
                ("temperature", models.DecimalField(decimal_places=2, max_digits=5)),
                ("conductivity", models.DecimalField(decimal_places=2, max_digits=5)),
                ("timestamp", models.DateTimeField(auto_now_add=True)),
            ],
        ),
    ]

View File

@ -0,0 +1,18 @@
from django.db import models
import uuid
class Sensor(models.Model):
    """A sensor record keyed by a generated UUID and tagged with an area code."""

    # On the right-hand side `uuid` is still the imported module; the class
    # attribute of the same name is only bound after this statement runs.
    uuid = models.UUIDField(
        default=uuid.uuid4,
        primary_key=True,
        editable=False,
    )
    area_code = models.CharField(max_length=10)

    def __str__(self):
        """Return a short label naming the sensor's UUID and area."""
        identifier, area = self.uuid, self.area_code
        return f"Sensor {identifier} in Area {area}"
class SensorData(models.Model):
    """One timestamped value of a single parameter type, optionally linked to a sensor."""

    # NULL is allowed for now; deleting a sensor cascades to its readings,
    # which are reachable from the sensor as `sensor.data`.
    sensor = models.ForeignKey(
        Sensor,
        null=True,
        related_name='data',
        on_delete=models.CASCADE,
    )
    # A default is supplied for the parameter type.
    parameter_type = models.CharField(default='moisture', max_length=20)
    value = models.DecimalField(decimal_places=2, max_digits=10)
    timestamp = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        """Return a label with the owning sensor's UUID (or 'N/A') and the timestamp."""
        if self.sensor:
            origin = self.sensor.uuid
        else:
            origin = 'N/A'
        return f"Data from Sensor {origin} at {self.timestamp}"

View File

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View File

@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/11 18:56
# @Author : 黄子寒
# @Email : 1064071566@qq.com
# @File : urls.py
# @Project : EmoLLM
from django.urls import path
from .views import get_sensor_data
# Single endpoint: api/<area_code>/<parameter_type>/ is handled by get_sensor_data,
# which receives both path segments as string arguments.
urlpatterns = [
    path('api/<str:area_code>/<str:parameter_type>/', get_sensor_data, name='get_sensor_data'),
]

View File

@ -0,0 +1,36 @@
# sensors/views.py
from django.http import JsonResponse
from .models import Sensor
def get_sensor_data(request, area_code, parameter_type):
    """
    Return the newest reading of one parameter type for every sensor in an area.

    :param request: the incoming request (not inspected)
    :param area_code: area whose sensors are looked up
    :param parameter_type: parameter type to report, matched exactly
    :return: JsonResponse with ``area_code``, ``parameter_type`` and a ``sensors``
        list of ``sensor_uuid`` / ``value`` / ``timestamp`` entries; a 404 with an
        ``error`` key when the area has no sensors or no matching data; a 500
        with a generic ``error`` on unexpected failure
    """
    import logging

    try:
        # Materialise the queryset once so the emptiness check and the loop
        # below share a single query.
        sensors = list(Sensor.objects.filter(area_code=area_code))
        if not sensors:
            return JsonResponse({'error': 'No sensors found for this area code'}, status=404)

        data_list = []
        for sensor in sensors:
            # Latest reading of the requested parameter for this sensor.
            data = sensor.data.filter(parameter_type=parameter_type).order_by('-timestamp').first()
            if data is not None:
                data_list.append({
                    'sensor_uuid': str(sensor.uuid),
                    'value': float(data.value),
                    'timestamp': data.timestamp.isoformat(),
                })

        if not data_list:
            return JsonResponse({'error': f'No {parameter_type} data available for this area'}, status=404)

        return JsonResponse({
            'area_code': area_code,
            'parameter_type': parameter_type,
            'sensors': data_list,
        })
    except Exception:
        # Record the traceback server-side; the client only gets a generic
        # message so internal details are not exposed in the response.
        logging.getLogger(__name__).exception(
            "get_sensor_data failed for area_code=%r parameter_type=%r",
            area_code,
            parameter_type,
        )
        return JsonResponse({'error': 'Internal server error'}, status=500)

View File

@ -30,7 +30,7 @@ with open("../processPDF/cleaned_data.txt", "r", encoding="utf-8") as f:
cleaned_text = f.read()
# 自定义分割函数,按最大100字以内的句子段落
# 自定义分割函数,按最大300字以内的句子段落
def split_text_to_sentences(text, max_length=300):
sentences = re.split('(?<=。)', text)
grouped_sentences = []
@ -100,7 +100,7 @@ def parse_multiple_qa(answer_text):
# 迭代限制防止API额度过大
def checklen(text):
while len(text) > 8000: # 限制在8000字符以内
while len(text) > 80000:
del text[0]
return text

View File

@ -45,6 +45,6 @@ for img_path in image_list:
f.write(f"{word}\n")
print(f"Word: {word}, Confidence: {confidence}")
print(f" {word}, C: {confidence}")
print(f"{txt_file_path}")

View File

@ -33,7 +33,7 @@ for page_number in range(len(pdf_document)):
image_path = os.path.join(output_image_dir, f"{page_number + 1}.png")
pix.save(image_path)
print(f"Saved {image_path}")
print(f" {image_path}")
pdf_document.close()

View File

@ -21,7 +21,7 @@ def load_qa_data(file_path):
# 文本预处理
def preprocess_text(text):
stop_words = set(stopwords.words('english'))
stop_words = set(stopwords.words('chinese'))
tokens = word_tokenize(text.lower())
tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
return tokens

View File

@ -39,7 +39,7 @@ def check(filepath):
if __name__ == '__main__':
dir_path = '.'
dir_path = ''
paths = get_all_file_paths(dir_path, suffix='.json')
for path in paths:
print(check(filepath=path))

View File

@ -23,7 +23,7 @@ def get_all_file_paths(folder_path, file_type='.jsonl'):
if __name__ == '__main__':
conversion_lis = []
folder_path = r'./' # python merge_jsonl.py > curr.txt
folder_path = r'/' # python merge_jsonl.py > curr.txt
merge_path = folder_path.split('/')[-1]
try:

View File

@ -54,8 +54,8 @@ if __name__ == '__main__':
for j in tqdm(range(10)):
Input = prompt(i)
question = checklen(getText("user",Input))
SparkApi.answer =""
SparkApi.main(appid,api_key, api_secret, Spark_url, domain, question)
SparkApi.answer = ""
SparkApi.main(appid, api_key, api_secret, Spark_url, domain, question)
getText("assistant", SparkApi.answer)
conversations.append(xinghuo_api(SparkApi.answer))
if i % 2 == 0 :

View File

@ -234,7 +234,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git
### 📌数据构建
- 请阅读[数据构建指南](generate_data/tutorial.md)查阅
- 请阅读[数据构建指南](IOTLLM/generate_data/tutorial.md)查阅
- 微调用到的数据集见[datasets](datasets/data.json)
### 🎨增量预训练、微调指南

View File

@ -235,7 +235,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git
- Quick coding: [Baby EmoLLM](quick_start/Baby_EmoLLM.ipynb)
### 📌Data Construction
- Please read the [Data Construction Guide ](generate_data/tutorial_EN.md) for reference.
- Please read the [Data Construction Guide ](IOTLLM/generate_data/tutorial_EN.md) for reference.
- The dataset used for this fine-tuning can be found at [datasets](datasets/data.json)
### 🎨Incremental Pre-training and Fine-tuning Guide

View File

@ -225,7 +225,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git
### 📌データ構築
- [データ構築ガイド](generate_data/tutorial_EN.md)を参照してください。
- [データ構築ガイド](IOTLLM/generate_data/tutorial_EN.md)を参照してください。
- この微調整に使用されたデータセットは[datasets](datasets/data.json)にあります。

View File

@ -35,4 +35,7 @@ websocket~=0.2.1
websocket-client~=1.6.2
gensim~=4.3.3
pillow~=9.5.0
natsort~=8.4.0
natsort~=8.4.0
jsonlines~=4.0.0
django~=5.1.2
scikit-learn~=1.3.2