From 5651fc181e67fc8957043176c950d84f6f9d21c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A6=8F=E8=A8=B1=E6=88=91=E8=BE=9E=E5=BF=A7=E0=BF=90?= =?UTF-8?q?=E2=99=A1?= <127636623+Smiling-Weeping-zhr@users.noreply.github.com> Date: Mon, 22 Jan 2024 16:33:51 +0800 Subject: [PATCH] Add files via upload --- scripts/Gen/SparkApi.py | 136 ++++++++++++++++++++++++++++++++++++ scripts/Gen/gen_Chat.py | 60 ++++++++++++++++ scripts/Gen/gen_data.py | 60 ++++++++++++++++ scripts/Gen/prompt.py | 151 ++++++++++++++++++++++++++++++++++++++++ scripts/Gen/说明.txt | 2 + 5 files changed, 409 insertions(+) create mode 100644 scripts/Gen/SparkApi.py create mode 100644 scripts/Gen/gen_Chat.py create mode 100644 scripts/Gen/gen_data.py create mode 100644 scripts/Gen/prompt.py create mode 100644 scripts/Gen/说明.txt diff --git a/scripts/Gen/SparkApi.py b/scripts/Gen/SparkApi.py new file mode 100644 index 0000000..d0edbfc --- /dev/null +++ b/scripts/Gen/SparkApi.py @@ -0,0 +1,136 @@ +import _thread as thread +import base64 +import datetime +import hashlib +import hmac +import json +from urllib.parse import urlparse +import ssl +from datetime import datetime +from time import mktime +from urllib.parse import urlencode +from wsgiref.handlers import format_date_time + +import websocket # 使用websocket_client +answer = "" + +class Ws_Param(object): + # 初始化 + def __init__(self, APPID, APIKey, APISecret, Spark_url): + self.APPID = APPID + self.APIKey = APIKey + self.APISecret = APISecret + self.host = urlparse(Spark_url).netloc + self.path = urlparse(Spark_url).path + self.Spark_url = Spark_url + + # 生成url + def create_url(self): + # 生成RFC1123格式的时间戳 + now = datetime.now() + date = format_date_time(mktime(now.timetuple())) + + # 拼接字符串 + signature_origin = "host: " + self.host + "\n" + signature_origin += "date: " + date + "\n" + signature_origin += "GET " + self.path + " HTTP/1.1" + + # 进行hmac-sha256进行加密 + signature_sha = hmac.new(self.APISecret.encode('utf-8'), signature_origin.encode('utf-8'), + digestmod=hashlib.sha256).digest() + + signature_sha_base64 = base64.b64encode(signature_sha).decode(encoding='utf-8') + + authorization_origin = f'api_key="{self.APIKey}", algorithm="hmac-sha256", headers="host date request-line", signature="{signature_sha_base64}"' + + authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8') + + # 将请求的鉴权参数组合为字典 + v = { + "authorization": authorization, + "date": date, + "host": self.host + } + # 拼接鉴权参数,生成url + url = self.Spark_url + '?' + urlencode(v) + # 此处打印出建立连接时候的url,参考本demo的时候可取消上方打印的注释,比对相同参数时生成的url与自己代码生成的url是否一致 + return url + + +# 收到websocket错误的处理 +def on_error(ws, error): + print("### error:", error) + + +# 收到websocket关闭的处理 +def on_close(ws,one,two): + print(" ") + + +# 收到websocket连接建立的处理 +def on_open(ws): + thread.start_new_thread(run, (ws,)) + + +def run(ws, *args): + data = json.dumps(gen_params(appid=ws.appid, domain= ws.domain,question=ws.question)) + ws.send(data) + + +# 收到websocket消息的处理 +def on_message(ws, message): + # print(message) + data = json.loads(message) + code = data['header']['code'] + if code != 0: + print(f'请求错误: {code}, {data}') + ws.close() + else: + choices = data["payload"]["choices"] + status = choices["status"] + content = choices["text"][0]["content"] + print(content,end ="") + global answer + answer += content + # print(1) + if status == 2: + ws.close() + + +def gen_params(appid, domain,question): + """ + 通过appid和用户的提问来生成请参数 + """ + data = { + "header": { + "app_id": appid, + "uid": "1234" + }, + "parameter": { + "chat": { + "domain": domain, + "temperature": 0.5, + "max_tokens": 2048 + } + }, + "payload": { + "message": { + "text": question + } + } + } + return data + + +def main(appid, api_key, api_secret, Spark_url,domain, question): + # print("星火:") + wsParam = Ws_Param(appid, api_key, api_secret, Spark_url) + websocket.enableTrace(False) + wsUrl = wsParam.create_url() + ws = websocket.WebSocketApp(wsUrl, on_message=on_message, on_error=on_error, on_close=on_close, on_open=on_open) + ws.appid = appid + ws.question = question + ws.domain = domain + ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE}) + + diff --git a/scripts/Gen/gen_Chat.py b/scripts/Gen/gen_Chat.py new file mode 100644 index 0000000..945fde9 --- /dev/null +++ b/scripts/Gen/gen_Chat.py @@ -0,0 +1,60 @@ +import SparkApi +from prompt import * +from tqdm import tqdm + +# 以下密钥信息从控制台获取 +appid = "" # 填写控制台中获取的 APPID 信息 +api_secret = "" # 填写控制台中获取的 APISecret 信息 +api_key = "" # 填写控制台中获取的 APIKey 信息 + +# 用于配置大模型版本,默认“general/generalv2” +domain = "general" # v1.5版本 +# domain = "generalv2" # v2.0版本 +# 云端环境的服务地址 +Spark_url = "ws://spark-api.xf-yun.com/v1.1/chat" # v1.5环境的地址 +# Spark_url = "ws://spark-api.xf-yun.com/v2.1/chat" # v2.0环境的地址 + + +text = [] + + +# length = 0 + +def getText(role, content): + jsoncon = {} + jsoncon["role"] = role + jsoncon["content"] = content + text.append(jsoncon) + return text + + +def getlength(text): + length = 0 + for content in text: + temp = content["content"] + leng = len(temp) + length += leng + return length + + +def checklen(text): + while (getlength(text) > 8000): + del text[0] + return text + + +if __name__ == '__main__': + text.clear + file_name = 'train3.jsonl' + conversations = [] + for i in tqdm(range(200)): + Input = prompt(random.randint(0, 16)) + question = checklen(getText("user", Input)) + SparkApi.answer = "" + SparkApi.main(appid, api_key, api_secret, Spark_url, domain, question) + getText("assistant", SparkApi.answer) + conversations.append(ChatGLM3_6B(SparkApi.answer)) + for item in conversations: + save_jsonl(item, file_name) + conversations.clear() + diff --git a/scripts/Gen/gen_data.py b/scripts/Gen/gen_data.py new file mode 100644 index 0000000..c73381c --- /dev/null +++ b/scripts/Gen/gen_data.py @@ -0,0 +1,60 @@ +import SparkApi +from prompt import * +from tqdm import tqdm + + +# 以下密钥信息从控制台获取 +appid = "" # 填写控制台中获取的 APPID 信息 +api_secret = "" # 填写控制台中获取的 APISecret 信息 +api_key = "" # 填写控制台中获取的 APIKey 信息 + +#用于配置大模型版本,默认“general/generalv2” +domain = "general" # v1.5版本 +# domain = "generalv2" # v2.0版本 +#云端环境的服务地址 +Spark_url = "ws://spark-api.xf-yun.com/v1.1/chat" # v1.5环境的地址 +# Spark_url = "ws://spark-api.xf-yun.com/v2.1/chat" # v2.0环境的地址 + + +text =[] + +# length = 0 + +def getText(role,content): + jsoncon = {} + jsoncon["role"] = role + jsoncon["content"] = content + text.append(jsoncon) + return text + +def getlength(text): + length = 0 + for content in text: + temp = content["content"] + leng = len(temp) + length += leng + return length + +def checklen(text): + while (getlength(text) > 8000): + del text[0] + return text + + + +if __name__ == '__main__': + text.clear + file_name = 'a2.jsonl' + conversations = [] + for i in range(1): + for j in tqdm(range(10)): + Input = prompt(i) + question = checklen(getText("user",Input)) + SparkApi.answer ="" + SparkApi.main(appid,api_key, api_secret, Spark_url, domain, question) + getText("assistant", SparkApi.answer) + conversations.append(xinghuo_api(SparkApi.answer)) + if i % 2 == 0 : + save_jsonl(conversations, file_name) + conversations.clear() + diff --git a/scripts/Gen/prompt.py b/scripts/Gen/prompt.py new file mode 100644 index 0000000..2236d63 --- /dev/null +++ b/scripts/Gen/prompt.py @@ -0,0 +1,151 @@ +import json +import random +import re +import copy +# new_data = [{'role': 'user', 'content': '你好'}, {'role': 'assistant', 'content': '你好!有什么我可以帮助您的吗?'}, {'role': 'user', 'content': '生成一段心理医生和病人的对话'}, {'role': 'assistant', 'content': '心理医生:你好,我是你的心理医生。请问你的名字是?\n\n病人:我叫李明。\n\n心理医生:很高兴见到你,李明先生。你来咨询是因为什么问题呢?\n\n病人:最近我总是感觉很焦虑,睡眠也不好。\n\n心理医生:好的,可以跟我具体说说你的情况吗?你有什么压力或者担忧的事情吗?\n\n病人:其实我一直在担心工作上的表现,觉得自己做得不够好,还有家庭的问题。\n\n心理医生:这些都是很常见的问题。你可以告诉我你在工作中遇到了什么困难吗?我们可以一起探讨一下如何解决。\n\n病人:我觉得自己的工作能力不够强,经常被领导批评。而且我家里的情况也不是很好,父母经常吵架,让我很难受。\n\n心理医生:我理解你的感受。这些问题确实会让人感到压力和焦虑。不过我们可以通过一些方法来缓解这种情况。比如说,你可以尝试一些放松的活动,比如瑜伽或者冥想,来减轻压力和焦虑。同时,你也可以考虑寻求家人或者朋友的帮助,让他们给你提供一些支持和鼓励。\n\n病人:好的,我会试试的。谢谢你的建议。\n\n心理医生:不用客气,如果你有任何问题或者需要进一步的帮助,随时可以联系我。'}] +# text2 = [] +# data = {'system':'现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。', 'input':'', 'output':''} +# for val in new_data: +# if val['role'] == 'user': +# continue +# +# print(text2) + +def save_jsonl(conversations, path_file): + # 把对话写入文件 + with open(path_file, 'a+', encoding='utf-8') as f: + for conversation in conversations: + Json_String = json.dumps(conversation, ensure_ascii=False) + '\n' + f.write(Json_String) + + +# 生成输入提示词 +def prompt(life_type=0): + emotions_lis = [ + "钦佩", + "崇拜", + "欣赏", + "娱乐", + "焦虑", + "敬畏", + "尴尬", + "厌倦", + "冷静", + "困惑", + "渴望", + "厌恶", + "同情", + "痛苦", + "着迷", + "嫉妒", + "兴奋", + "恐惧", + "痛恨", + "有趣", + "快乐", + "怀旧", + "浪漫", + "悲伤", + "满意", + "性欲", + "同情", + "满足" + ] + areas_of_life = [ + "工作", + "学业(小学,初中,高中,大学,研究生,博士)", + "生活(衣,食,住,行等等)", + "身体", + "家人", + "朋友", + "社交", + "恋爱", + "就业", + "责任", + "爱好", + "环境", + "隐私", + "安全", + "梦想", + "自由" + ] + + # 输入数据处理 + if life_type < 0: + raise ValueError('life_type must > 0') + + emo = random.choice(emotions_lis) + life_type %= 16 + + Input = f'''你是一个研究过无数具有心理健康问题的病人与心理健康医生对话的专家,请你构造一些符合实际情况的具有心理健 + 康问题的病人和心理健康医生的连续的一段多轮对话记录。要求病人的问题属于{areas_of_life[life_type]}场景,具有{emo}情感,医生的回复尽可能包含心理辅导知识,并且能够一步步诱导病人说出自己的问题进而提供解决问题的可行方案。注意,构造的数据必须以医生的陈述为结束语,请只返回完整的对话内容。请以如下格式返回生成的数据: + 病人:病人的咨询或陈述 + 医生:医生的安抚和建议 + ''' + return Input + +def xinghuo_api(content): + # 对话格式 + conversation1 = {'system':'现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。', 'input':'', 'output':''} + conversation = {'input':'', 'output':''} + conversations = {'conversation':[]} + # temp = {'system':'现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。', 'input':'', 'output':''} + # 划分对话形式 + dialogue = re.split('医生:|病人:', content) + # 对话前的数据处理 + if dialogue[0] == '': + dialogue.pop(0) + # 一次对话 + flag = False + for ind, item in enumerate(dialogue): + if flag == False: + if (ind + 1) % 2 == 1: + conversation1['input'] = dialogue[ind] + else: + conversation1['output'] = dialogue[ind] + + if (ind + 1) % 2 == 0 or ind + 1 == len(dialogue): + temp = copy.deepcopy(conversation1) + conversations['conversation'].append(temp) + flag = True + continue + + else: + if (ind+1)%2 == 1: + conversation['input'] = dialogue[ind] + else: + conversation['output'] = dialogue[ind] + if (ind+1)%2 == 0 or ind+1 == len(dialogue): + # 浅赋值只会是同一个变量,必须要copy.deepcopy + # 若conversations['conversation'].append(conversation)后面改的话,~s里面的conversation也会改动 + # 就会变成n个一样的数据(这是我们不想看到的) + temp = copy.deepcopy(conversation) + conversations['conversation'].append(temp) + + return conversations + +def ChatGLM3_6B(content): + # 对话格式 + conversation = {'system': '现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。', 'input': '', + 'output': ''} + conversations = [] + # temp = {'system':'现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。', 'input':'', 'output':''} + # 划分对话形式 + dialogue = re.split('医生:|病人:', content) + # 对话前的数据处理 + if dialogue[0] == '': + dialogue.pop(0) + # 一次对话 + for ind, item in enumerate(dialogue): + if (ind + 1) % 2 == 1: + conversation['input'] = dialogue[ind] + else: + conversation['output'] = dialogue[ind] + if (ind + 1) % 2 == 0 or ind + 1 == len(dialogue): + # 浅赋值只会是同一个变量,必须要copy.deepcopy + # 若conversations['conversation'].append(conversation)后面改的话,~s里面的conversation也会改动 + # 就会变成n个一样的数据(这是我们不想看到的) + temp = copy.deepcopy(conversation) + conversations.append(temp) + + return conversations \ No newline at end of file diff --git a/scripts/Gen/说明.txt b/scripts/Gen/说明.txt new file mode 100644 index 0000000..788611e --- /dev/null +++ b/scripts/Gen/说明.txt @@ -0,0 +1,2 @@ +gen_Chat 使用于生成ChatGLM3-6B的数据集 +gen_data 适用于生成InternLM所需要的数据集 \ No newline at end of file