Merge branch 'SmartFlowAI:main' into main

This commit is contained in:
HongCheng 2024-04-29 11:03:52 +09:00 committed by GitHub
commit 65e37c0a4d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 140 additions and 0 deletions

View File

@ -0,0 +1,14 @@
## 一共有两个 .py 文件分别为Book_QA_process_Step_1.py和Book_QA_process_Step_2.py
### Book_QA_process_Step_1.py
该代码是将我们生成的QA对jsonl数据转换为json格式
### Book_QA_process_Step_2.py
该代码是将第一步生成的 json 格式数据转化为可用于指令微调的数据格式,并为每条数据添加 system 字段,输出格式如下:

```json
{
    "conversation": [
        {
            "system": "你由EmoLLM团队打造的心理健康助手......",
            "input": "Question",
            "output": "Answer"
        }
    ]
}
```

View File

@ -0,0 +1,35 @@
import os
import json
# Root directory that is scanned for .jsonl files by the os.walk loop further
# down. The path is relative to the current working directory.
directory_path = '../初步清洗的QA数据'
def convert_to_desired_format(input_file, output_file):
    """Convert a JSONL file of QA records into a JSON array of prompt/completion pairs.

    Each non-blank line of ``input_file`` is parsed as a JSON object and mapped
    to ``{"prompt": <question>, "completion": <answer>}``. The resulting list is
    written to ``output_file`` as indented, non-ASCII-escaped JSON.

    Args:
        input_file: Path of the UTF-8 encoded ``.jsonl`` file to read.
        output_file: Path of the JSON file to write (overwritten if present).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``question`` or ``answer`` key.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        # Blank or whitespace-only lines carry no record; json.loads would
        # raise on them, so they are skipped instead of aborting the file.
        records = [json.loads(line) for line in f if line.strip()]

    transformed_data = [
        {"prompt": record["question"], "completion": record["answer"]}
        for record in records
    ]

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(transformed_data, f, ensure_ascii=False, indent=4)
# Convert every .jsonl file found anywhere under directory_path, writing the
# result next to the source file with a .json extension.
#
# os.walk already visits the whole tree, so it is run exactly once; wrapping
# it in a loop over os.listdir() would redo every conversion once per
# directory entry.
if not os.path.isdir(directory_path):
    # os.walk silently yields nothing for a missing directory; fail loudly so
    # a wrong path is noticed.
    raise FileNotFoundError(f"QA data directory not found: {directory_path}")
for root, dirs, files in os.walk(directory_path):
    for filename in files:
        # Only the raw .jsonl inputs are converted.
        if filename.endswith('.jsonl'):
            file_path = os.path.join(root, filename)
            # Swap only the final extension; str.replace would also rewrite
            # any '.jsonl' that happens to appear in a directory name.
            output_file = os.path.splitext(file_path)[0] + '.json'
            convert_to_desired_format(file_path, output_file)

View File

@ -0,0 +1,32 @@
import json
import os
# Root directory that is scanned for the .json files to read. The path is
# relative to the current working directory.
directory_path = '../初步清洗的QA数据'
# System prompt stored in the "system" field of every generated conversation.
system = "你由EmoLLM团队打造的心理健康助手是一个研究过无数具有心理健康问题的病人与心理健康医生对话的心理专家, 在心理方面拥有广博的知识储备和丰富的研究咨询经验。请充分利用专业心理学知识,对用户提出的问题进行回答。"
# Accumulates one {"conversation": [...]} entry per QA item across all files.
format2_data = []
# Collect every prompt/completion item from the .json files under
# directory_path and wrap each one as a single-turn conversation carrying the
# system prompt.
#
# os.walk already visits the whole tree, so it is run exactly once; wrapping
# it in a loop over os.listdir() would append every record once per directory
# entry and fill the output with duplicates.
if not os.path.isdir(directory_path):
    # os.walk silently yields nothing for a missing directory; fail loudly so
    # a wrong path is noticed.
    raise FileNotFoundError(f"QA data directory not found: {directory_path}")
for root, dirs, files in os.walk(directory_path):
    for filename in files:
        # Only .json files are read.
        if filename.endswith('.json'):
            file_path = os.path.join(root, filename)
            with open(file_path, 'rt', encoding='utf-8') as file:
                format1_data = json.load(file)
            for item in format1_data:
                conversation = {
                    "system": system,
                    "input": item["prompt"],
                    "output": item["completion"]
                }
                format2_data.append({"conversation": [conversation]})

# Write the merged dataset into the current working directory.
with open('processed_Book_QA.json', 'wt', encoding='utf-8') as file:
    json.dump(format2_data, file, ensure_ascii=False, indent=4)

View File

@ -0,0 +1,59 @@
import json
# Given JSON data in string format
# original_json_data = """
# [
# {
# "conversation": [
# {"system": "system", "input": "input", "output": "output"},
# {"input": "input", "output": "output"},
# {"input": "input", "output": "output"}
# ]
# },
# {
# "conversation": [
# {"system": "system", "input": "input", "output": "output"},
# {"input": "input", "output": "output"},
# {"input": "input", "output": "output"}
# ]
# }
# ]
# """
# Parse the original JSON data into Python objects
def convert_xtuner_to_sharegpt(input_path, output_path):
    """Convert a conversation JSON file into a "from"/"value" message layout.

    The input is a JSON list of objects shaped like
    ``{"conversation": [{"system": ..., "input": ..., "output": ...}, ...]}``.
    Each object becomes
    ``{"conversations": [{"from": "human", ...}, {"from": "gpt", ...}, ...],
    "system": ...}``, with one human/gpt message pair per turn.

    Args:
        input_path: Path of the UTF-8 encoded JSON file to read.
        output_path: Path of the JSON file to write (overwritten if present).

    Raises:
        KeyError: If a group lacks ``conversation`` or a turn lacks ``input``
            or ``output``.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    transformed_conversations = []
    for conversation_group in data:
        turns = conversation_group["conversation"]
        # The system prompt is taken from the first turn. A group with no
        # turns, or whose first turn has no "system" key, gets an empty
        # prompt instead of raising IndexError/KeyError.
        system = turns[0].get("system", "") if turns else ""

        # Every turn, including the first, contributes a human/gpt pair.
        transformed_pairs = []
        for pair in turns:
            transformed_pairs.append({"from": "human", "value": pair["input"]})
            transformed_pairs.append({"from": "gpt", "value": pair["output"]})

        transformed_conversations.append({
            "conversations": transformed_pairs,
            "system": system,
        })

    with open(output_path, "w", encoding='utf-8') as output_file:
        json.dump(transformed_conversations, output_file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Script entry point: convert the scientist dataset and write the result
    # beside it.
    input_path, output_path = (
        "../datasets/scientist.json",
        "../datasets/scientist_sharegpt.json",
    )
    convert_xtuner_to_sharegpt(input_path=input_path, output_path=output_path)