[Update] Modify and add files related to fine-tuning with internlm2_7b_base (#116)

This commit is contained in:
xzw 2024-03-21 15:47:35 +08:00 committed by GitHub
commit e0ec624943
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 295 additions and 18 deletions

View File

@ -7552,9 +7552,6 @@
}
]
},
{
"conversation": []
},
{
"conversation": [
{
@ -8540,9 +8537,6 @@
}
]
},
{
"conversation": []
},
{
"conversation": [
{
@ -13389,9 +13383,6 @@
}
]
},
{
"conversation": []
},
{
"conversation": [
{
@ -19973,9 +19964,6 @@
}
]
},
{
"conversation": []
},
{
"conversation": [
{

View File

@ -1,12 +1,25 @@
"""Attach the EmoLLM system prompt to every multi-turn conversation record.

Reads ``<DATASET_DIR>/<file_name>``, sets the ``system`` field on the first
turn of each record, and writes the result to ``processed_<file_name>`` in the
current working directory.
"""
import json

# Other inputs this script has been run against:
#   'multi_turn_dataset_1.json', 'multi_turn_dataset_2.json', 'data_pro.json'
file_name = 'data.json'
DATASET_DIR = '/root/StableCascade/emollm2/EmoLLM/datasets'

SYSTEM_PROMPT = "你是心理健康助手EmoLLM由EmoLLM团队打造。你旨在通过专业心理咨询协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术一步步帮助来访者解决心理问题。"


def add_system_prompt(data, system=SYSTEM_PROMPT):
    """Set ``system`` on the first turn of every record, in place.

    Args:
        data: list of records shaped like ``{"conversation": [turn, ...]}``.
        system: system prompt stored under ``turn["system"]`` of the first
            turn.

    Returns:
        Indices of the records that could not be updated because they have
        no first turn (empty or missing ``conversation``). Those records are
        left untouched and are also printed so they can be cleaned up in the
        source dataset.
    """
    skipped = []
    for index, record in enumerate(data):
        try:
            record['conversation'][0]['system'] = system
        except (IndexError, KeyError, TypeError):
            # Only malformed/empty records are expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            print(index, record)
            skipped.append(index)
    return skipped


def main():
    """Run the conversion for ``file_name``."""
    with open(f'{DATASET_DIR}/{file_name}', 'rt', encoding='utf-8') as file:
        data = json.load(file)

    add_system_prompt(data)

    with open(f'processed_{file_name}', 'wt', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    # Show one sample as a sanity check; an empty dataset has none to show.
    if data:
        print(data[0])


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,34 @@
"""Merge every ``.json`` file of a directory into one JSON list."""
import os
import json

# Directory that holds the JSON files to merge.
directory_path = './'
# File the merged list is written to.
output_path = 'combined_data.json'


def combine_json_files(directory_path, skip_path=None):
    """Load all ``.json`` files in ``directory_path`` into a single list.

    A file whose top-level value is a list contributes its elements; any
    other top-level value is appended as one element.

    Args:
        directory_path: directory to scan (not recursive).
        skip_path: optional path of a file to leave out. Pass the output
            file here so that a re-run does not merge the previous output
            back into itself and duplicate every record.

    Returns:
        The merged list. Files are read in sorted filename order so the
        result is reproducible (``os.listdir`` order is arbitrary).
    """
    skip_abs = None if skip_path is None else os.path.abspath(skip_path)
    combined_list = []
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory_path, filename)
        if skip_abs is not None and os.path.abspath(file_path) == skip_abs:
            continue
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        if isinstance(data, list):
            combined_list.extend(data)
        else:
            combined_list.append(data)
    return combined_list


def main():
    """Merge the files in ``directory_path`` and write ``output_path``."""
    combined_list = combine_json_files(directory_path, skip_path=output_path)
    # The merged list is large; printing it is slow, so it is only saved.
    with open(output_path, 'w', encoding='utf-8') as combined_json_file:
        json.dump(combined_list, combined_json_file,
                  ensure_ascii=False, indent=4)


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,27 @@
"""Convert a single-turn prompt/completion dataset to conversation format.

Reads ``<DATASET_DIR>/<file_name>`` and writes ``./processed_<file_name>``.
"""
import json

# Other input this script has been run against: 'single_turn_dataset_1.json'
file_name = 'single_turn_dataset_2.json'
DATASET_DIR = '/root/StableCascade/emollm2/EmoLLM/datasets'

system = "你是心理健康助手EmoLLM由EmoLLM团队打造。你旨在通过专业心理咨询协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术一步步帮助来访者解决心理问题。"


def convert_single_turn(format1_data, system=system):
    """Wrap each prompt/completion pair in a one-turn conversation record.

    Args:
        format1_data: iterable of dicts with ``"prompt"`` and
            ``"completion"`` keys.
        system: system prompt stored in every generated turn.

    Returns:
        A list of ``{"conversation": [{"system", "input", "output"}]}``
        records, one per input item, in input order.

    Raises:
        KeyError: if an item lacks ``"prompt"`` or ``"completion"``.
    """
    return [
        {
            "conversation": [
                {
                    "system": system,
                    "input": item["prompt"],
                    "output": item["completion"],
                }
            ]
        }
        for item in format1_data
    ]


def main():
    """Run the conversion for ``file_name``."""
    with open(f'{DATASET_DIR}/{file_name}', 'rt', encoding='utf-8') as file:
        format1_data = json.load(file)

    format2_data = convert_single_turn(format1_data)

    with open(f'./processed_{file_name}', 'wt', encoding='utf-8') as file:
        json.dump(format2_data, file, ensure_ascii=False, indent=4)

    # Show one sample as a sanity check; an empty dataset has none to show.
    if format2_data:
        print(format2_data[0])


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,203 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig)
from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import template_map_fn_factory
from xtuner.engine import DatasetInfoHook, EvaluateChatHook
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
#######################################################################
# PART 1 Settings #
#######################################################################
# All tunable values of this fine-tuning config live in this section; the
# later parts only reference these names.
# Model
# Alternative base checkpoint (chat variant), currently disabled:
# pretrained_model_name_or_path = '/root/share/model_repos/internlm2-chat-7b'
pretrained_model_name_or_path = '/root/share/model_repos/internlm2-base-7b'
# Data
# Alternative training file, currently disabled:
# data_path = 'merge.json'
# Absolute path to the merged training JSON; adjust for your environment.
data_path ='/root/StableCascade/emollm2/EmoLLM/datasets/processed/combined_data.json'
# Template definitions:
# https://github.com/InternLM/xtuner/blob/main/xtuner/utils/templates.py#L24C25-L24C25
# The chat template is used with the base model because PROMPT_TEMPLATE has
# no dedicated internlm2_base entry.
prompt_template = PROMPT_TEMPLATE.internlm2_chat # there is No internlm2_base
max_length = 2048
pack_to_max_length = True
# Scheduler & Optimizer
# Alternative setting, currently disabled. NOTE(review): 8 * 2 and 16 * 1
# give the same product, so presumably the same effective batch per
# optimizer step -- confirm.
# batch_size = 8 # per_device
# accumulative_counts = 2
batch_size = 16 # per_device
accumulative_counts = 1
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
# Used below as warmup_ratio * max_epochs, i.e. the warmup length in epochs.
warmup_ratio = 0.03
# Evaluate the generation performance during the training
# Passed to EvaluateChatHook as every_n_iters.
evaluation_freq = 500
# Earlier, shorter system prompt kept for reference (English gloss): "You are
# now a psychology expert; I have some psychological problems, please use
# your professional knowledge to help me solve them."
# System prompt handed to EvaluateChatHook for the sample generations.
SYSTEM = "你是心理健康助手EmoLLM由EmoLLM团队打造。你旨在通过专业心理咨询协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术一步步帮助来访者解决心理问题。"
# Sample user messages handed to EvaluateChatHook as evaluation_inputs.
evaluation_inputs = [
'我最近总是感到很焦虑,尤其是在学业上。我有个特别崇拜的同学,他好像在各方面都比我优秀,我总觉得自己怎么努力也追不上他,这让我压力特别大。', '我知道应该理性看待,但就是忍不住会去比较。我甚至晚上会因为这个睡不着觉,总想着怎样才能像他那样出色。'
]
#######################################################################
# PART 2 Model & Tokenizer #
#######################################################################
# Each dict names a factory under `type` plus the keyword arguments for it;
# presumably the training framework instantiates them later -- confirm.
tokenizer = dict(
type=AutoTokenizer.from_pretrained,
pretrained_model_name_or_path=pretrained_model_name_or_path,
trust_remote_code=True,
padding_side='right')
# Supervised fine-tuning wrapper: a 4-bit quantized base model (`llm`) with a
# LoRA adapter config (`lora`).
model = dict(
type=SupervisedFinetune,
llm=dict(
type=AutoModelForCausalLM.from_pretrained,
pretrained_model_name_or_path=pretrained_model_name_or_path,
trust_remote_code=True,
torch_dtype=torch.float16,
# 4-bit NF4 quantization with double quantization; compute in float16.
# NOTE(review): the llm_int8_* options presumably have no effect while
# load_in_8bit is False -- confirm.
quantization_config=dict(
type=BitsAndBytesConfig,
load_in_4bit=True,
load_in_8bit=False,
llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=False,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type='nf4')),
# LoRA adapter hyper-parameters: rank 64, alpha 16, dropout 0.1, no bias
# training, causal-LM task.
lora=dict(
type=LoraConfig,
r=64,
lora_alpha=16,
lora_dropout=0.1,
bias='none',
task_type='CAUSAL_LM'))
#######################################################################
# PART 3 Dataset & Dataloader #
#######################################################################
# Training dataset built from the local JSON file at `data_path`.
# NOTE(review): the name `alpaca_en` looks inherited from a template config;
# the data actually loaded is whatever `data_path` points to.
alpaca_en = dict(
type=process_hf_dataset,
# Load the JSON file as the `train` split.
dataset=dict(type=load_dataset, path='json', data_files=dict(train=data_path)),
tokenizer=tokenizer,
max_length=max_length,
# No dataset-specific mapping is applied; presumably the records are already
# in the conversation format the pipeline expects -- confirm.
dataset_map_fn=None,
template_map_fn=dict(
type=template_map_fn_factory, template=prompt_template),
remove_unused_columns=True,
shuffle_before_pack=True,
pack_to_max_length=pack_to_max_length)
# Dataloader over the dataset above, with shuffled sampling.
train_dataloader = dict(
batch_size=batch_size,
num_workers=dataloader_num_workers,
dataset=alpaca_en,
sampler=dict(type=DefaultSampler, shuffle=True),
collate_fn=dict(type=default_collate_fn))
#######################################################################
# PART 4 Scheduler & Optimizer #
#######################################################################
# optimizer
# AdamW wrapped in AmpOptimWrapper with float16 and dynamic loss scaling;
# gradients are clipped to `max_norm` and accumulated over
# `accumulative_counts` steps.
optim_wrapper = dict(
type=AmpOptimWrapper,
optimizer=dict(
type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
accumulative_counts=accumulative_counts,
loss_scale='dynamic',
dtype='float16')
# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
# Two phases, both declared in epochs and converted to iteration-based:
# a LinearLR warmup over the first warmup_ratio * max_epochs epochs, then
# CosineAnnealingLR starting where the warmup ends.
param_scheduler = [
dict(
type=LinearLR,
start_factor=1e-5,
by_epoch=True,
begin=0,
end=warmup_ratio * max_epochs,
convert_to_iter_based=True),
dict(
type=CosineAnnealingLR,
eta_min=0.0,
by_epoch=True,
begin=warmup_ratio * max_epochs,
T_max=max_epochs,
convert_to_iter_based=True)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1)
#######################################################################
# PART 5 Runtime #
#######################################################################
# Log the dialogue periodically during the training process, optional
# EvaluateChatHook is given `evaluation_inputs` and `SYSTEM` and is
# configured with every_n_iters=evaluation_freq.
custom_hooks = [
dict(type=DatasetInfoHook, tokenizer=tokenizer),
dict(
type=EvaluateChatHook,
tokenizer=tokenizer,
every_n_iters=evaluation_freq,
evaluation_inputs=evaluation_inputs,
system=SYSTEM,
prompt_template=prompt_template)
]
# configure default hooks
default_hooks = dict(
# record the time of every iteration.
timer=dict(type=IterTimerHook),
# print log every 10 iterations.
logger=dict(type=LoggerHook, interval=10),
# enable the parameter scheduler.
param_scheduler=dict(type=ParamSchedulerHook),
# save checkpoint per epoch.
checkpoint=dict(type=CheckpointHook, interval=1),
# set sampler seed in distributed environment.
sampler_seed=dict(type=DistSamplerSeedHook),
)
# configure environment
env_cfg = dict(
# whether to enable cudnn benchmark
cudnn_benchmark=False,
# set multi process parameters
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
# set distributed parameters
dist_cfg=dict(backend='nccl'),
)
# set visualizer
visualizer = None
# set log level
log_level = 'INFO'
# load from which checkpoint
load_from = None
# whether to resume training from the loaded checkpoint
resume = False
# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)

View File

@ -1,11 +1,23 @@
datasets==2.16.1
deepspeed==0.13.1
einops==0.7.0
flash_attn==2.5.0
mmengine==0.10.2
openxlab==0.0.34
peft==0.7.1
sentencepiece==0.1.99
torch==2.1.2
transformers==4.36.2
xtuner==0.1.11
# modified version
# xtuner==0.1.11
# mmengine==0.10.2
mmengine==0.10.3
xtuner==0.1.15
# flash_attn==2.5.0 # building from source is very slow (roughly 2 hours); install a prebuilt wheel instead:
# method 1: https://github.com/Dao-AILab/flash-attention/releases
# flash_attn-2.5.0+cu122torch2.1cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
# method 2:
# pip install /root/share/wheels/flash_attn-2.4.2+cu118torch2.0cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
# mpi4py==3.1.5 # conda install mpi4py