Add files via upload

This commit is contained in:
এ許我辞忧࿐♡ 2024-01-22 16:33:51 +08:00 committed by GitHub
parent 7a36847479
commit 5651fc181e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 409 additions and 0 deletions

136
scripts/Gen/SparkApi.py Normal file
View File

@ -0,0 +1,136 @@
import _thread as thread
import base64
import datetime
import hashlib
import hmac
import json
from urllib.parse import urlparse
import ssl
from datetime import datetime
from time import mktime
from urllib.parse import urlencode
from wsgiref.handlers import format_date_time
import websocket # 使用websocket_client
# Module-level buffer for the reply text: on_message appends every streamed
# chunk to it, so it has to be reset to "" before a new request is sent.
answer = ""
class Ws_Param(object):
    """Hold the Spark credentials and build the signed websocket URL."""

    def __init__(self, APPID, APIKey, APISecret, Spark_url):
        """Store the credentials and split ``Spark_url`` into host and path."""
        self.APPID = APPID
        self.APIKey = APIKey
        self.APISecret = APISecret
        parsed = urlparse(Spark_url)
        self.host = parsed.netloc
        self.path = parsed.path
        self.Spark_url = Spark_url

    def create_url(self):
        """Return ``Spark_url`` extended with the signed query string.

        The query carries three parameters: ``authorization`` (base64 of the
        api key, algorithm, signed header names and HMAC-SHA256 signature),
        ``date`` (the timestamp that was signed) and ``host``.
        """
        # RFC 1123 formatted timestamp of the current time.
        date = format_date_time(mktime(datetime.now().timetuple()))

        # The text that gets signed: host line, date line and request line.
        signature_origin = "\n".join((
            "host: " + self.host,
            "date: " + date,
            "GET " + self.path + " HTTP/1.1",
        ))

        # HMAC-SHA256 over that text, keyed with the API secret.
        digest = hmac.new(
            self.APISecret.encode('utf-8'),
            signature_origin.encode('utf-8'),
            digestmod=hashlib.sha256,
        ).digest()
        signature = base64.b64encode(digest).decode(encoding='utf-8')

        authorization_origin = (
            f'api_key="{self.APIKey}", algorithm="hmac-sha256", '
            f'headers="host date request-line", signature="{signature}"'
        )
        authorization = base64.b64encode(
            authorization_origin.encode('utf-8')
        ).decode(encoding='utf-8')

        # Append the authentication parameters to the base URL.
        query = urlencode({
            "authorization": authorization,
            "date": date,
            "host": self.host,
        })
        return self.Spark_url + '?' + query
# 收到websocket错误的处理
def on_error(ws, error):
    """Websocket error callback: print the error to stdout."""
    print("### error:", error)
# 收到websocket关闭的处理
def on_close(ws, one, two):
    """Websocket close callback: print a single-space line.

    The two extra positional arguments are accepted and ignored.
    """
    print(" ")
# 收到websocket连接建立的处理
def on_open(ws):
    """Websocket open callback: start a new thread that runs ``run(ws)``."""
    worker_args = (ws,)
    thread.start_new_thread(run, worker_args)
def run(ws, *args):
    """Build the request from the attributes stored on ``ws`` and send it.

    Reads ``ws.appid``, ``ws.domain`` and ``ws.question``; extra positional
    arguments are ignored.
    """
    request = gen_params(appid=ws.appid, domain=ws.domain, question=ws.question)
    ws.send(json.dumps(request))
# 收到websocket消息的处理
def on_message(ws, message):
    """Websocket message callback: collect one streamed reply frame.

    A frame whose header code is non-zero is reported on stdout and the
    connection is closed. Otherwise the frame's first text content is echoed
    to stdout without a newline and appended to the module-level ``answer``.
    The connection is closed once a frame reports status 2.
    """
    global answer

    data = json.loads(message)
    code = data['header']['code']
    if code != 0:
        print(f'请求错误: {code}, {data}')
        ws.close()
        return

    choices = data["payload"]["choices"]
    status = choices["status"]
    content = choices["text"][0]["content"]
    print(content, end="")
    answer += content
    if status == 2:
        ws.close()
def gen_params(appid, domain, question, uid="1234", temperature=0.5, max_tokens=2048):
    """Build the request payload from the app id and the user's messages.

    Args:
        appid: Application id placed in the request header.
        domain: Model domain placed in the chat parameters.
        question: Message list sent as the payload text.
        uid: User id placed in the request header.
        temperature: Sampling temperature placed in the chat parameters.
        max_tokens: Token limit placed in the chat parameters.

    Returns:
        A dict with ``header``, ``parameter`` and ``payload`` sections, ready
        to be serialized with ``json.dumps``.
    """
    return {
        "header": {
            "app_id": appid,
            "uid": uid,
        },
        "parameter": {
            "chat": {
                "domain": domain,
                "temperature": temperature,
                "max_tokens": max_tokens,
            },
        },
        "payload": {
            "message": {
                "text": question,
            },
        },
    }
def main(appid, api_key, api_secret, Spark_url, domain, question):
    """Send ``question`` to the Spark endpoint and block until the socket closes.

    The reply is not returned; the ``on_message`` callback accumulates it in
    the module-level ``answer``.
    """
    signer = Ws_Param(appid, api_key, api_secret, Spark_url)
    websocket.enableTrace(False)
    signed_url = signer.create_url()
    ws = websocket.WebSocketApp(
        signed_url,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
        on_open=on_open,
    )
    # The request fields ride on the app object so the callbacks can read them.
    ws.appid = appid
    ws.question = question
    ws.domain = domain
    # NOTE(review): CERT_NONE switches off TLS certificate verification; this
    # only matters for wss:// URLs - confirm whether it is really required.
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

60
scripts/Gen/gen_Chat.py Normal file
View File

@ -0,0 +1,60 @@
import SparkApi
from prompt import *
from tqdm import tqdm
# The credentials below are obtained from the console.
appid = "" # APPID shown in the console
api_secret = "" # APISecret shown in the console
api_key = "" # APIKey shown in the console
# Selects the model version; the defaults are "general" / "generalv2".
domain = "general" # v1.5
# domain = "generalv2" # v2.0
# Service address of the cloud environment.
# NOTE(review): ws:// is unencrypted and the signed URL built by SparkApi
# carries the api key - confirm whether the wss:// endpoint should be used.
Spark_url = "ws://spark-api.xf-yun.com/v1.1/chat" # v1.5 endpoint
# Spark_url = "ws://spark-api.xf-yun.com/v2.1/chat" # v2.0 endpoint
# Running message history; getText appends to it and returns it.
text = []
# length = 0
def getText(role, content):
    """Append a role/content message to the module-level ``text`` history.

    Returns the history list itself, not a copy.
    """
    text.append({"role": role, "content": content})
    return text
def getlength(text):
    """Return the summed length of the ``content`` field of every message."""
    return sum(len(message["content"]) for message in text)
def checklen(text):
    """Trim the history in place until its content length is at most 8000.

    The oldest messages are removed first; the same list is returned.
    """
    while getlength(text) > 8000:
        text.pop(0)
    return text
if __name__ == '__main__':
    # Imported locally so the script does not depend on ``random`` leaking out
    # of ``from prompt import *``.
    import random

    # The original ``text.clear`` only referenced the method; call it so the
    # history really starts empty.
    text.clear()
    file_name = 'train3.jsonl'
    for _ in tqdm(range(200)):
        # prompt() knows 16 life areas (indices 0-15). randint includes both
        # ends, so the former upper bound of 16 wrapped to 0 and made the
        # first area twice as likely as the others.
        Input = prompt(random.randint(0, 15))
        question = checklen(getText("user", Input))
        SparkApi.answer = ""
        SparkApi.main(appid, api_key, api_secret, Spark_url, domain, question)
        getText("assistant", SparkApi.answer)
        # Append this dialogue's turns immediately: the file ends up with the
        # same lines in the same order, but a failure late in the run no
        # longer discards everything generated before it.
        save_jsonl(ChatGLM3_6B(SparkApi.answer), file_name)

60
scripts/Gen/gen_data.py Normal file
View File

@ -0,0 +1,60 @@
import SparkApi
from prompt import *
from tqdm import tqdm
# The credentials below are obtained from the console.
appid = "" # APPID shown in the console
api_secret = "" # APISecret shown in the console
api_key = "" # APIKey shown in the console
# Selects the model version; the defaults are "general" / "generalv2".
domain = "general" # v1.5
# domain = "generalv2" # v2.0
# Service address of the cloud environment.
# NOTE(review): ws:// is unencrypted and the signed URL built by SparkApi
# carries the api key - confirm whether the wss:// endpoint should be used.
Spark_url = "ws://spark-api.xf-yun.com/v1.1/chat" # v1.5 endpoint
# Spark_url = "ws://spark-api.xf-yun.com/v2.1/chat" # v2.0 endpoint
# Running message history; getText appends to it and returns it.
text =[]
# length = 0
def getText(role, content):
    """Record one message in the module-level ``text`` history.

    The message is stored as a dict with ``role`` and ``content`` keys and
    the history list itself is returned.
    """
    message = {"role": role, "content": content}
    text.append(message)
    return text
def getlength(text):
    """Return the total number of characters across all message contents."""
    content_lengths = (len(entry["content"]) for entry in text)
    return sum(content_lengths)
def checklen(text):
    """Drop the oldest messages in place while the content exceeds 8000 characters.

    Returns the same list object that was passed in.
    """
    while getlength(text) > 8000:
        text.pop(0)
    return text
if __name__ == '__main__':
    # The original ``text.clear`` only referenced the method; call it so the
    # history really starts empty.
    text.clear()
    file_name = 'a2.jsonl'
    for i in range(1):
        for _ in tqdm(range(10)):
            Input = prompt(i)
            question = checklen(getText("user", Input))
            SparkApi.answer = ""
            SparkApi.main(appid, api_key, api_secret, Spark_url, domain, question)
            getText("assistant", SparkApi.answer)
            # Write every conversation as soon as it exists. The previous
            # ``if i % 2 == 0`` gate was always true for range(1), but it
            # would silently drop the last batch if the outer range were
            # widened to end on an odd ``i``. The file contents for the
            # current configuration are unchanged.
            save_jsonl([xinghuo_api(SparkApi.answer)], file_name)

151
scripts/Gen/prompt.py Normal file
View File

@ -0,0 +1,151 @@
import json
import random
import re
import copy
# new_data = [{'role': 'user', 'content': '你好'}, {'role': 'assistant', 'content': '你好!有什么我可以帮助您的吗?'}, {'role': 'user', 'content': '生成一段心理医生和病人的对话'}, {'role': 'assistant', 'content': '心理医生:你好,我是你的心理医生。请问你的名字是?\n\n病人我叫李明。\n\n心理医生很高兴见到你李明先生。你来咨询是因为什么问题呢\n\n病人最近我总是感觉很焦虑睡眠也不好。\n\n心理医生好的可以跟我具体说说你的情况吗你有什么压力或者担忧的事情吗\n\n病人其实我一直在担心工作上的表现觉得自己做得不够好还有家庭的问题。\n\n心理医生这些都是很常见的问题。你可以告诉我你在工作中遇到了什么困难吗我们可以一起探讨一下如何解决。\n\n病人我觉得自己的工作能力不够强经常被领导批评。而且我家里的情况也不是很好父母经常吵架让我很难受。\n\n心理医生我理解你的感受。这些问题确实会让人感到压力和焦虑。不过我们可以通过一些方法来缓解这种情况。比如说你可以尝试一些放松的活动比如瑜伽或者冥想来减轻压力和焦虑。同时你也可以考虑寻求家人或者朋友的帮助让他们给你提供一些支持和鼓励。\n\n病人好的我会试试的。谢谢你的建议。\n\n心理医生不用客气如果你有任何问题或者需要进一步的帮助随时可以联系我。'}]
# text2 = []
# data = {'system':'现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。', 'input':'', 'output':''}
# for val in new_data:
# if val['role'] == 'user':
# continue
#
# print(text2)
def save_jsonl(conversations, path_file):
    """Append every item of ``conversations`` to ``path_file`` as one JSON line.

    The file is opened in append mode with UTF-8 encoding and non-ASCII
    characters are written as is rather than escaped.
    """
    with open(path_file, 'a+', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in conversations
        )
# 生成输入提示词
def prompt(life_type=0):
    """Build the instruction that asks the model for a patient/doctor dialogue.

    Args:
        life_type: Non-negative index of the life area the patient's problem
            belongs to. Values past the end of the list wrap around.

    Returns:
        The prompt text, naming the chosen life area and a randomly picked
        emotion, followed by the expected output format.

    Raises:
        ValueError: If ``life_type`` is negative.
    """
    # "同情" was listed twice, which doubled its sampling weight; each emotion
    # now appears once.
    emotions_lis = (
        "钦佩",
        "崇拜",
        "欣赏",
        "娱乐",
        "焦虑",
        "敬畏",
        "尴尬",
        "厌倦",
        "冷静",
        "困惑",
        "渴望",
        "厌恶",
        "同情",
        "痛苦",
        "着迷",
        "嫉妒",
        "兴奋",
        "恐惧",
        "痛恨",
        "有趣",
        "快乐",
        "怀旧",
        "浪漫",
        "悲伤",
        "满意",
        "性欲",
        "满足",
    )
    areas_of_life = (
        "工作",
        "学业(小学,初中,高中,大学,研究生,博士)",
        "生活(衣,食,住,行等等)",
        "身体",
        "家人",
        "朋友",
        "社交",
        "恋爱",
        "就业",
        "责任",
        "爱好",
        "环境",
        "隐私",
        "安全",
        "梦想",
        "自由",
    )

    # Zero is a valid index, so the message now states the real lower bound.
    if life_type < 0:
        raise ValueError('life_type must be >= 0')

    emo = random.choice(emotions_lis)
    area = areas_of_life[life_type % len(areas_of_life)]

    # Two fixes in the text itself: the stray line break that split the word
    # "健康" is gone, and the format example now shows the "病人:" / "医生:"
    # speaker prefixes that xinghuo_api and ChatGLM3_6B split the reply on.
    return (
        '你是一个研究过无数具有心理健康问题的病人与心理健康医生对话的专家,'
        '请你构造一些符合实际情况的具有心理健康问题的病人和心理健康医生的连续的一段多轮对话记录'
        f'要求病人的问题属于{area}场景具有{emo}情感'
        '医生的回复尽可能包含心理辅导知识并且能够一步步诱导病人说出自己的问题进而提供解决问题的可行方案'
        '注意构造的数据必须以医生的陈述为结束语请只返回完整的对话内容请以如下格式返回生成的数据\n'
        '病人:病人的咨询或陈述\n'
        '医生:医生的安抚和建议\n'
    )
def xinghuo_api(content):
    """Convert a generated dialogue into a multi-turn conversation record.

    The text is split on the "医生:" / "病人:" speaker prefixes and the pieces
    are paired in order: the first of each pair becomes ``input`` and the
    second ``output``. Only the first turn carries the ``system`` prompt.

    Args:
        content: Raw dialogue text returned by the model.

    Returns:
        ``{'conversation': [turn, ...]}`` where every turn is a dict. A
        trailing piece without a partner yields a turn with an empty
        ``output``.
    """
    system_prompt = '现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。'

    segments = re.split('医生:|病人:', content)
    # Text that starts with a speaker prefix leaves an empty first piece.
    if segments[0] == '':
        segments.pop(0)

    turns = []
    for start in range(0, len(segments), 2):
        has_reply = start + 1 < len(segments)
        # A fresh dict per turn: an unpaired last piece used to inherit the
        # previous turn's output because one dict was reused for every turn.
        turn = {
            'input': segments[start],
            'output': segments[start + 1] if has_reply else '',
        }
        if start == 0:
            turn = {'system': system_prompt, **turn}
        turns.append(turn)
    return {'conversation': turns}
def ChatGLM3_6B(content):
    """Convert a generated dialogue into a list of single-turn records.

    The text is split on the "医生:" / "病人:" speaker prefixes and the pieces
    are paired in order: the first of each pair becomes ``input`` and the
    second ``output``. Every record carries the same ``system`` prompt.

    Args:
        content: Raw dialogue text returned by the model.

    Returns:
        A list of ``{'system', 'input', 'output'}`` dicts. A trailing piece
        without a partner yields a record with an empty ``output``.
    """
    system_prompt = '现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。'

    segments = re.split('医生:|病人:', content)
    # Text that starts with a speaker prefix leaves an empty first piece.
    if segments[0] == '':
        segments.pop(0)

    records = []
    for start in range(0, len(segments), 2):
        has_reply = start + 1 < len(segments)
        # A fresh dict per record: an unpaired last piece used to inherit the
        # previous record's output because one dict was reused throughout.
        records.append({
            'system': system_prompt,
            'input': segments[start],
            'output': segments[start + 1] if has_reply else '',
        })
    return records

2
scripts/Gen/说明.txt Normal file
View File

@ -0,0 +1,2 @@
gen_Chat 适用于生成ChatGLM3-6B所需要的数据集
gen_data 适用于生成InternLM所需要的数据集