自定义数据集处理脚本
This commit is contained in:
parent
1125b67f50
commit
1ff833393d
37
IOTLLM/SensorAPI/LLMscript/script.py
Normal file
37
IOTLLM/SensorAPI/LLMscript/script.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2024/11/12 16:37
|
||||||
|
# @Author : 黄子寒
|
||||||
|
# @Email : 1064071566@qq.com
|
||||||
|
# @File : script.py
|
||||||
|
# @Project : EmoLLM
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def query_sensor_data(area_code: str, parameter_type: str, timeout: float = 10.0) -> dict:
    """
    Query the local sensor API for the data of one area and one data type.

    :param area_code: str, area code (e.g. "A" or "B")
    :param parameter_type: str, data type (e.g. "moisture", "temperature", "conductivity")
    :param timeout: float, seconds to wait for the server before giving up
    :return: dict, the decoded JSON response on success, otherwise
             ``{"error": <message>}`` describing the failure
    """
    # Imported locally so this function does not need a new file-level import.
    from urllib.parse import quote

    # Percent-encode both path segments: the values come from the command line
    # or a tool call, and a stray "/", "?" or "#" would otherwise change which
    # endpoint is requested.
    url = (
        "http://127.0.0.1:8000/sensors/api/"
        f"{quote(str(area_code), safe='')}/{quote(str(parameter_type), safe='')}/"
    )
    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # raise on 4xx / 5xx responses
        return response.json()  # decode the JSON body
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decoding error is not a RequestException subclass.
        return {"error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
# 示例用法:可供 LLM function calling 调用
|
||||||
|
if __name__ == "__main__":
    import sys

    # Two positional command-line arguments are expected:
    # the area code first, then the data type.
    cli_args = sys.argv[1:]
    if len(cli_args) >= 2:
        print(query_sensor_data(cli_args[0], cli_args[1]))
    else:
        print("请提供区域代码和数据类型参数")
|
0
IOTLLM/SensorAPI/SensorAPI/__init__.py
Normal file
0
IOTLLM/SensorAPI/SensorAPI/__init__.py
Normal file
16
IOTLLM/SensorAPI/SensorAPI/asgi.py
Normal file
16
IOTLLM/SensorAPI/SensorAPI/asgi.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
"""
|
||||||
|
ASGI config for SensorAPI project.
|
||||||
|
|
||||||
|
It exposes the ASGI callable as a module-level variable named ``application``.
|
||||||
|
|
||||||
|
For more information on this file, see
|
||||||
|
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from django.core.asgi import get_asgi_application
|
||||||
|
|
||||||
|
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "SensorAPI.settings")
|
||||||
|
|
||||||
|
application = get_asgi_application()
|
124
IOTLLM/SensorAPI/SensorAPI/settings.py
Normal file
124
IOTLLM/SensorAPI/SensorAPI/settings.py
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
"""
|
||||||
|
Django settings for SensorAPI project.
|
||||||
|
|
||||||
|
Generated by 'django-admin startproject' using Django 5.1.2.
|
||||||
|
|
||||||
|
For more information on this file, see
|
||||||
|
https://docs.djangoproject.com/en/5.1/topics/settings/
|
||||||
|
|
||||||
|
For the full list of settings and their values, see
|
||||||
|
https://docs.djangoproject.com/en/5.1/ref/settings/
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
||||||
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||||
|
|
||||||
|
|
||||||
|
# Quick-start development settings - unsuitable for production
|
||||||
|
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
|
||||||
|
|
||||||
|
# SECURITY WARNING: keep the secret key used in production secret!
# The key is read from the DJANGO_SECRET_KEY environment variable so that a
# deployment does not have to rely on the value committed to version control.
# The fallback is the generated development key and must not be used in
# production.
import os

SECRET_KEY = os.environ.get(
    "DJANGO_SECRET_KEY",
    "django-insecure-0+#$1#@+&8$+y#f%0q!^kcz-+5&nkqhaluu*3mv8fa9t793u=z",
)

# SECURITY WARNING: don't run with debug turned on in production!
# Defaults to True (development behaviour); set DJANGO_DEBUG to "0", "false",
# "no" or "off" to disable it.
DEBUG = os.environ.get("DJANGO_DEBUG", "true").strip().lower() in {"1", "true", "yes", "on"}

# Comma-separated host names from DJANGO_ALLOWED_HOSTS; empty by default.
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get("DJANGO_ALLOWED_HOSTS", "").split(",")
    if host.strip()
]
|
||||||
|
|
||||||
|
|
||||||
|
# Application definition
|
||||||
|
|
||||||
|
INSTALLED_APPS = [
|
||||||
|
"django.contrib.admin",
|
||||||
|
"django.contrib.auth",
|
||||||
|
"django.contrib.contenttypes",
|
||||||
|
"django.contrib.sessions",
|
||||||
|
"django.contrib.messages",
|
||||||
|
"django.contrib.staticfiles",
|
||||||
|
'sensors',
|
||||||
|
]
|
||||||
|
|
||||||
|
MIDDLEWARE = [
|
||||||
|
"django.middleware.security.SecurityMiddleware",
|
||||||
|
"django.contrib.sessions.middleware.SessionMiddleware",
|
||||||
|
"django.middleware.common.CommonMiddleware",
|
||||||
|
"django.middleware.csrf.CsrfViewMiddleware",
|
||||||
|
"django.contrib.auth.middleware.AuthenticationMiddleware",
|
||||||
|
"django.contrib.messages.middleware.MessageMiddleware",
|
||||||
|
"django.middleware.clickjacking.XFrameOptionsMiddleware",
|
||||||
|
]
|
||||||
|
|
||||||
|
ROOT_URLCONF = "SensorAPI.urls"
|
||||||
|
|
||||||
|
TEMPLATES = [
|
||||||
|
{
|
||||||
|
"BACKEND": "django.template.backends.django.DjangoTemplates",
|
||||||
|
"DIRS": [],
|
||||||
|
"APP_DIRS": True,
|
||||||
|
"OPTIONS": {
|
||||||
|
"context_processors": [
|
||||||
|
"django.template.context_processors.debug",
|
||||||
|
"django.template.context_processors.request",
|
||||||
|
"django.contrib.auth.context_processors.auth",
|
||||||
|
"django.contrib.messages.context_processors.messages",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
WSGI_APPLICATION = "SensorAPI.wsgi.application"
|
||||||
|
|
||||||
|
|
||||||
|
# Database
|
||||||
|
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
|
||||||
|
|
||||||
|
DATABASES = {
|
||||||
|
"default": {
|
||||||
|
"ENGINE": "django.db.backends.sqlite3",
|
||||||
|
"NAME": BASE_DIR / "db.sqlite3",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Password validation
|
||||||
|
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
|
||||||
|
|
||||||
|
AUTH_PASSWORD_VALIDATORS = [
|
||||||
|
{
|
||||||
|
"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# Internationalization
|
||||||
|
# https://docs.djangoproject.com/en/5.1/topics/i18n/
|
||||||
|
|
||||||
|
LANGUAGE_CODE = "en-us"
|
||||||
|
|
||||||
|
TIME_ZONE = "UTC"
|
||||||
|
|
||||||
|
USE_I18N = True
|
||||||
|
|
||||||
|
USE_TZ = True
|
||||||
|
|
||||||
|
|
||||||
|
# Static files (CSS, JavaScript, Images)
|
||||||
|
# https://docs.djangoproject.com/en/5.1/howto/static-files/
|
||||||
|
|
||||||
|
STATIC_URL = "static/"
|
||||||
|
|
||||||
|
# Default primary key field type
|
||||||
|
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
|
||||||
|
|
||||||
|
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
|
25
IOTLLM/SensorAPI/SensorAPI/urls.py
Normal file
25
IOTLLM/SensorAPI/SensorAPI/urls.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
"""
|
||||||
|
URL configuration for SensorAPI project.
|
||||||
|
|
||||||
|
The `urlpatterns` list routes URLs to views. For more information please see:
|
||||||
|
https://docs.djangoproject.com/en/5.1/topics/http/urls/
|
||||||
|
Examples:
|
||||||
|
Function views
|
||||||
|
1. Add an import: from my_app import views
|
||||||
|
2. Add a URL to urlpatterns: path('', views.home, name='home')
|
||||||
|
Class-based views
|
||||||
|
1. Add an import: from other_app.views import Home
|
||||||
|
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
|
||||||
|
Including another URLconf
|
||||||
|
1. Import the include() function: from django.urls import include, path
|
||||||
|
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
|
||||||
|
"""
|
||||||
|
|
||||||
|
from django.contrib import admin
|
||||||
|
from django.urls import path
|
||||||
|
|
||||||
|
from django.urls import include, path
|
||||||
|
|
||||||
|
urlpatterns = [
|
||||||
|
path('sensors/', include('sensors.urls')),
|
||||||
|
]
|
16
IOTLLM/SensorAPI/SensorAPI/wsgi.py
Normal file
16
IOTLLM/SensorAPI/SensorAPI/wsgi.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
"""
|
||||||
|
WSGI config for SensorAPI project.
|
||||||
|
|
||||||
|
It exposes the WSGI callable as a module-level variable named ``application``.
|
||||||
|
|
||||||
|
For more information on this file, see
|
||||||
|
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from django.core.wsgi import get_wsgi_application
|
||||||
|
|
||||||
|
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "SensorAPI.settings")
|
||||||
|
|
||||||
|
application = get_wsgi_application()
|
6
IOTLLM/SensorAPI/__init__.py
Normal file
6
IOTLLM/SensorAPI/__init__.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2024/11/11 19:25
|
||||||
|
# @Author : 黄子寒
|
||||||
|
# @Email : 1064071566@qq.com
|
||||||
|
# @File : __init__.py
|
||||||
|
# @Project : EmoLLM
|
22
IOTLLM/SensorAPI/manage.py
Normal file
22
IOTLLM/SensorAPI/manage.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
"""Django's command-line utility for administrative tasks."""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Point Django at the project settings and dispatch the command line."""
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "SensorAPI.settings")
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        # Re-raise with a hint about the usual causes, keeping the original
        # error chained as the cause.
        hint = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(hint) from exc
    else:
        execute_from_command_line(sys.argv)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
0
IOTLLM/SensorAPI/sensors/__init__.py
Normal file
0
IOTLLM/SensorAPI/sensors/__init__.py
Normal file
3
IOTLLM/SensorAPI/sensors/admin.py
Normal file
3
IOTLLM/SensorAPI/sensors/admin.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
from django.contrib import admin
|
||||||
|
|
||||||
|
# Register your models here.
|
6
IOTLLM/SensorAPI/sensors/apps.py
Normal file
6
IOTLLM/SensorAPI/sensors/apps.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class SensorsConfig(AppConfig):
    """Application configuration for the ``sensors`` app."""

    # Dotted name of the application this configuration applies to.
    name = "sensors"
    # Field type used for implicitly created primary keys.
    default_auto_field = "django.db.models.BigAutoField"
|
6
IOTLLM/SensorAPI/sensors/management/__init__.py
Normal file
6
IOTLLM/SensorAPI/sensors/management/__init__.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2024/11/11 20:03
|
||||||
|
# @Author : 黄子寒
|
||||||
|
# @Email : 1064071566@qq.com
|
||||||
|
# @File : __init__.py
|
||||||
|
# @Project : EmoLLM
|
6
IOTLLM/SensorAPI/sensors/management/commands/__init__.py
Normal file
6
IOTLLM/SensorAPI/sensors/management/commands/__init__.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2024/11/11 20:03
|
||||||
|
# @Author : 黄子寒
|
||||||
|
# @Email : 1064071566@qq.com
|
||||||
|
# @File : __init__.py
|
||||||
|
# @Project : EmoLLM
|
@ -0,0 +1,32 @@
|
|||||||
|
# sensors/management/commands/populate_data.py
|
||||||
|
|
||||||
|
import random
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
from sensors.models import Sensor, SensorData
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Management command that wipes and regenerates sample sensor data.

    Every existing ``Sensor`` and ``SensorData`` row is deleted, then each
    area "A" to "Z" receives 2 to 5 sensors with one random reading per
    parameter type.
    """

    help = "Populate database with sample sensor data for areas A to Z"

    def handle(self, *args, **kwargs):
        """Replace all sensor data with freshly generated sample rows."""
        # Imported locally so this block does not need a new file-level import.
        from django.db import transaction

        areas = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
        parameter_types = ['moisture', 'temperature', 'conductivity']

        # Run the wipe and the repopulation in one transaction so that a
        # failure part-way through does not leave the database empty or
        # half-filled.
        with transaction.atomic():
            Sensor.objects.all().delete()
            # Also removes readings that are not attached to any sensor
            # (the foreign key is nullable, so cascading does not reach them).
            SensorData.objects.all().delete()

            for area in areas:
                num_sensors = random.randint(2, 5)  # 2 to 5 sensors per area
                for _ in range(num_sensors):
                    sensor = Sensor.objects.create(area_code=area)
                    # One reading per parameter type for every sensor.
                    for param in parameter_types:
                        value = round(random.uniform(0.1, 100.0), 2)
                        SensorData.objects.create(
                            sensor=sensor,
                            parameter_type=param,
                            value=value,
                            timestamp=timezone.now(),
                        )

        self.stdout.write(self.style.SUCCESS('Successfully populated sample data for areas A to Z'))
|
32
IOTLLM/SensorAPI/sensors/migrations/0001_initial.py
Normal file
32
IOTLLM/SensorAPI/sensors/migrations/0001_initial.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
# Generated by Django 5.1.2 on 2024-11-11 12:10
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Initial migration for the ``sensors`` app; creates the ``SensorData`` table.

    NOTE(review): the table created here (area_code / moisture / temperature /
    conductivity columns) does not match the Sensor and SensorData models
    declared in sensors/models.py, and no follow-up migration is visible next
    to it -- confirm that ``makemigrations`` has been run for the current
    models.
    """

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name="SensorData",
            fields=[
                # Automatically created primary key.
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
                    ),
                ),
                ("area_code", models.CharField(max_length=10)),
                # Three readings stored as decimals with two decimal places.
                ("moisture", models.DecimalField(decimal_places=2, max_digits=5)),
                ("temperature", models.DecimalField(decimal_places=2, max_digits=5)),
                ("conductivity", models.DecimalField(decimal_places=2, max_digits=5)),
                ("timestamp", models.DateTimeField(auto_now_add=True)),
            ],
        ),
    ]
|
0
IOTLLM/SensorAPI/sensors/migrations/__init__.py
Normal file
0
IOTLLM/SensorAPI/sensors/migrations/__init__.py
Normal file
18
IOTLLM/SensorAPI/sensors/models.py
Normal file
18
IOTLLM/SensorAPI/sensors/models.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
from django.db import models
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
class Sensor(models.Model):
    """A sensor identified by a UUID and located in one area."""

    # Primary key; a random UUID is generated by default and is not editable.
    uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    # Code of the area the sensor belongs to.
    area_code = models.CharField(max_length=10)

    def __str__(self):
        """Return a readable label containing the UUID and the area code."""
        return "Sensor {} in Area {}".format(self.uuid, self.area_code)
|
||||||
|
|
||||||
|
class SensorData(models.Model):
    """One timestamped reading of a single parameter, optionally tied to a sensor."""

    # Owning sensor, reachable from Sensor as ``sensor.data``; null is
    # temporarily allowed.
    sensor = models.ForeignKey(Sensor, on_delete=models.CASCADE, related_name='data', null=True)
    # Name of the measured parameter; a default value is provided.
    parameter_type = models.CharField(max_length=20, default='moisture')
    value = models.DecimalField(max_digits=10, decimal_places=2)
    # Set automatically when the row is created.
    timestamp = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        """Return a readable label with the owning sensor's UUID and the timestamp."""
        # ``sensor_id`` is the stored foreign-key value, i.e. the sensor's UUID
        # primary key. Using it avoids loading the related Sensor row just to
        # build a label.
        sensor_label = self.sensor_id if self.sensor_id is not None else 'N/A'
        return f"Data from Sensor {sensor_label} at {self.timestamp}"
|
3
IOTLLM/SensorAPI/sensors/tests.py
Normal file
3
IOTLLM/SensorAPI/sensors/tests.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
from django.test import TestCase
|
||||||
|
|
||||||
|
# Create your tests here.
|
12
IOTLLM/SensorAPI/sensors/urls.py
Normal file
12
IOTLLM/SensorAPI/sensors/urls.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2024/11/11 18:56
|
||||||
|
# @Author : 黄子寒
|
||||||
|
# @Email : 1064071566@qq.com
|
||||||
|
# @File : urls.py
|
||||||
|
# @Project : EmoLLM
|
||||||
|
from django.urls import path
|
||||||
|
from .views import get_sensor_data
|
||||||
|
|
||||||
|
urlpatterns = [
|
||||||
|
path('api/<str:area_code>/<str:parameter_type>/', get_sensor_data, name='get_sensor_data'),
|
||||||
|
]
|
36
IOTLLM/SensorAPI/sensors/views.py
Normal file
36
IOTLLM/SensorAPI/sensors/views.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
# sensors/views.py
|
||||||
|
|
||||||
|
from django.http import JsonResponse
|
||||||
|
|
||||||
|
from .models import Sensor
|
||||||
|
|
||||||
|
|
||||||
|
def get_sensor_data(request, area_code, parameter_type):
    """Return the newest reading of one parameter for every sensor in an area.

    On success the JSON body has the keys 'area_code', 'parameter_type' and
    'sensors'; each entry of 'sensors' holds 'sensor_uuid', 'value' and
    'timestamp' (ISO 8601).

    Responds with 404 and an 'error' key when the area has no sensors or when
    none of its sensors has data of the requested type, and with 500 and a
    generic 'error' message on any unexpected failure.
    """
    # Imported locally so this block does not need a new file-level import.
    import logging

    try:
        # Evaluate the queryset once instead of issuing a separate exists()
        # query before iterating it.
        sensors = list(Sensor.objects.filter(area_code=area_code))
        if not sensors:
            return JsonResponse({'error': 'No sensors found for this area code'}, status=404)

        data_list = []
        for sensor in sensors:
            # Newest reading of the requested parameter for this sensor.
            data = sensor.data.filter(parameter_type=parameter_type).order_by('-timestamp').first()
            if data is not None:
                data_list.append({
                    'sensor_uuid': str(sensor.uuid),
                    'value': float(data.value),
                    'timestamp': data.timestamp.isoformat(),
                })

        if not data_list:
            return JsonResponse({'error': f'No {parameter_type} data available for this area'}, status=404)

        return JsonResponse({
            'area_code': area_code,
            'parameter_type': parameter_type,
            'sensors': data_list,
        })
    except Exception:
        # Log the full traceback on the server, but do not send internal
        # exception text (which may reveal implementation details) to the client.
        logging.getLogger(__name__).exception(
            "get_sensor_data failed for area_code=%r parameter_type=%r",
            area_code,
            parameter_type,
        )
        return JsonResponse({'error': 'Internal server error'}, status=500)
|
@ -30,7 +30,7 @@ with open("../processPDF/cleaned_data.txt", "r", encoding="utf-8") as f:
|
|||||||
cleaned_text = f.read()
|
cleaned_text = f.read()
|
||||||
|
|
||||||
|
|
||||||
# 自定义分割函数,按最大100字以内的句子段落
|
# 自定义分割函数,按最大300字以内的句子段落
|
||||||
def split_text_to_sentences(text, max_length=300):
|
def split_text_to_sentences(text, max_length=300):
|
||||||
sentences = re.split('(?<=。)', text)
|
sentences = re.split('(?<=。)', text)
|
||||||
grouped_sentences = []
|
grouped_sentences = []
|
||||||
@ -100,7 +100,7 @@ def parse_multiple_qa(answer_text):
|
|||||||
|
|
||||||
# 迭代限制,防止API额度过大
|
# 迭代限制,防止API额度过大
|
||||||
def checklen(text):
|
def checklen(text):
|
||||||
while len(text) > 8000: # 限制在8000字符以内
|
while len(text) > 80000:
|
||||||
del text[0]
|
del text[0]
|
||||||
return text
|
return text
|
||||||
|
|
@ -45,6 +45,6 @@ for img_path in image_list:
|
|||||||
|
|
||||||
f.write(f"{word}\n")
|
f.write(f"{word}\n")
|
||||||
|
|
||||||
print(f"Word: {word}, Confidence: {confidence}")
|
print(f" {word}, C: {confidence}")
|
||||||
|
|
||||||
print(f"{txt_file_path}")
|
print(f"{txt_file_path}")
|
@ -33,7 +33,7 @@ for page_number in range(len(pdf_document)):
|
|||||||
image_path = os.path.join(output_image_dir, f"{page_number + 1}.png")
|
image_path = os.path.join(output_image_dir, f"{page_number + 1}.png")
|
||||||
pix.save(image_path)
|
pix.save(image_path)
|
||||||
|
|
||||||
print(f"Saved {image_path}")
|
print(f" {image_path}")
|
||||||
|
|
||||||
|
|
||||||
pdf_document.close()
|
pdf_document.close()
|
@ -21,7 +21,7 @@ def load_qa_data(file_path):
|
|||||||
|
|
||||||
# 文本预处理
|
# 文本预处理
|
||||||
def preprocess_text(text):
|
def preprocess_text(text):
|
||||||
stop_words = set(stopwords.words('english'))
|
stop_words = set(stopwords.words('chinese'))
|
||||||
tokens = word_tokenize(text.lower())
|
tokens = word_tokenize(text.lower())
|
||||||
tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
|
tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
|
||||||
return tokens
|
return tokens
|
@ -39,7 +39,7 @@ def check(filepath):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
dir_path = '.'
|
dir_path = ''
|
||||||
paths = get_all_file_paths(dir_path, suffix='.json')
|
paths = get_all_file_paths(dir_path, suffix='.json')
|
||||||
for path in paths:
|
for path in paths:
|
||||||
print(check(filepath=path))
|
print(check(filepath=path))
|
@ -23,7 +23,7 @@ def get_all_file_paths(folder_path, file_type='.jsonl'):
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
conversion_lis = []
|
conversion_lis = []
|
||||||
|
|
||||||
folder_path = r'./' # python merge_jsonl.py > curr.txt
|
folder_path = r'/' # python merge_jsonl.py > curr.txt
|
||||||
|
|
||||||
merge_path = folder_path.split('/')[-1]
|
merge_path = folder_path.split('/')[-1]
|
||||||
try:
|
try:
|
@ -54,8 +54,8 @@ if __name__ == '__main__':
|
|||||||
for j in tqdm(range(10)):
|
for j in tqdm(range(10)):
|
||||||
Input = prompt(i)
|
Input = prompt(i)
|
||||||
question = checklen(getText("user",Input))
|
question = checklen(getText("user",Input))
|
||||||
SparkApi.answer =""
|
SparkApi.answer = ""
|
||||||
SparkApi.main(appid,api_key, api_secret, Spark_url, domain, question)
|
SparkApi.main(appid, api_key, api_secret, Spark_url, domain, question)
|
||||||
getText("assistant", SparkApi.answer)
|
getText("assistant", SparkApi.answer)
|
||||||
conversations.append(xinghuo_api(SparkApi.answer))
|
conversations.append(xinghuo_api(SparkApi.answer))
|
||||||
if i % 2 == 0 :
|
if i % 2 == 0 :
|
@ -234,7 +234,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git
|
|||||||
|
|
||||||
|
|
||||||
### 📌数据构建
|
### 📌数据构建
|
||||||
- 请阅读[数据构建指南](generate_data/tutorial.md)查阅
|
- 请阅读[数据构建指南](IOTLLM/generate_data/tutorial.md)查阅
|
||||||
- 微调用到的数据集见[datasets](datasets/data.json)
|
- 微调用到的数据集见[datasets](datasets/data.json)
|
||||||
|
|
||||||
### 🎨增量预训练、微调指南
|
### 🎨增量预训练、微调指南
|
||||||
|
@ -235,7 +235,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git
|
|||||||
- Quick coding: [Baby EmoLLM](quick_start/Baby_EmoLLM.ipynb)
|
- Quick coding: [Baby EmoLLM](quick_start/Baby_EmoLLM.ipynb)
|
||||||
|
|
||||||
### 📌Data Construction
|
### 📌Data Construction
|
||||||
- Please read the [Data Construction Guide ](generate_data/tutorial_EN.md) for reference.
|
- Please read the [Data Construction Guide ](IOTLLM/generate_data/tutorial_EN.md) for reference.
|
||||||
- The dataset used for this fine-tuning can be found at [datasets](datasets/data.json)
|
- The dataset used for this fine-tuning can be found at [datasets](datasets/data.json)
|
||||||
|
|
||||||
### 🎨Incremental Pre-training and Fine-tuning Guide
|
### 🎨Incremental Pre-training and Fine-tuning Guide
|
||||||
|
@ -225,7 +225,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git
|
|||||||
|
|
||||||
### 📌データ構築
|
### 📌データ構築
|
||||||
|
|
||||||
- [データ構築ガイド](generate_data/tutorial_EN.md)を参照してください。
|
- [データ構築ガイド](IOTLLM/generate_data/tutorial_EN.md)を参照してください。
|
||||||
|
|
||||||
- この微調整に使用されたデータセットは[datasets](datasets/data.json)にあります。
|
- この微調整に使用されたデータセットは[datasets](datasets/data.json)にあります。
|
||||||
|
|
||||||
|
@ -36,3 +36,6 @@ websocket-client~=1.6.2
|
|||||||
gensim~=4.3.3
|
gensim~=4.3.3
|
||||||
pillow~=9.5.0
|
pillow~=9.5.0
|
||||||
natsort~=8.4.0
|
natsort~=8.4.0
|
||||||
|
jsonlines~=4.0.0
|
||||||
|
django~=5.1.2
|
||||||
|
scikit-learn~=1.3.2
|
Loading…
Reference in New Issue
Block a user