This commit is contained in:
xzw 2024-03-21 15:58:09 +08:00 committed by GitHub
commit 412c80aef3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 305 additions and 29 deletions

View File

@ -44,17 +44,16 @@
## 数据集去重
结合绝对匹配以及模糊匹配(Simhash)算法,对数据集进行去重以提升微调模型的效果。在确保数据集的高质量的同时,通过调整阈值减少因错误匹配而丢失重要数据的风险。
Simhash算法
**Simhash算法介绍**
Simhash相似性哈希是一种用于检测大量数据中相似或重复项的算法。它通过将文本转换为一组数值指纹来工作这些指纹对相似的文本具有高度的相似性。Simhash算法对于处理文本数据特别有效尤其是在处理大量数据时。
实现步骤:
文本预处理将文本数据转换为适合Simhash处理的格式。这可能包括分词、去除停用词、词干提取等。
生成Simhash指纹对预处理后的文本应用Simhash算法生成一组数值指纹。每个指纹代表文本内容的一个哈希值。
比较指纹通过比较哈希值的相似性来识别重复或相似的记录。Simhash的特点是即使在文本有少量差异时生成的哈希值也具有较高的相似性。
确定阈值:设置一个相似性阈值,只有当两个指纹的相似度超过这个阈值时,才认为它们代表相似或重复的记录。
处理相似记录:对于被标记为相似的记录,可以进一步人工审查或自动合并,以消除重复。
**Simhash实现步骤**
*文本预处理将文本数据转换为适合Simhash处理的格式。这可能包括分词、去除停用词、词干提取等。
*生成Simhash指纹对预处理后的文本应用Simhash算法生成一组数值指纹。每个指纹代表文本内容的一个哈希值。
*比较指纹通过比较哈希值的相似性来识别重复或相似的记录。Simhash的特点是即使在文本有少量差异时生成的哈希值也具有较高的相似性。
*确定阈值:设置一个相似性阈值,只有当两个指纹的相似度超过这个阈值时,才认为它们代表相似或重复的记录。
*处理相似记录:对于被标记为相似的记录,可以进一步人工审查或自动合并,以消除重复。
## 用法
### deduplicate.py
`deduplicate.py` 用于将datasets下以模型命名的文件夹下(例如:'datasets/qwen').json数据进行去重输出去重后的数据到 `datasets/qwen/dedup` 文件夹下。

View File

@ -7552,9 +7552,6 @@
}
]
},
{
"conversation": []
},
{
"conversation": [
{
@ -8540,9 +8537,6 @@
}
]
},
{
"conversation": []
},
{
"conversation": [
{
@ -13389,9 +13383,6 @@
}
]
},
{
"conversation": []
},
{
"conversation": [
{
@ -19973,9 +19964,6 @@
}
]
},
{
"conversation": []
},
{
"conversation": [
{

View File

@ -1,12 +1,25 @@
import json
# 打开JSON文件并读取其内容
with open('/root/Emollm/datasets/multi_turn_dataset_2.json', 'rt', encoding='utf-8') as file:
# file_name = 'multi_turn_dataset_1.json'
# file_name = 'multi_turn_dataset_2.json'
# file_name = 'data_pro.json'
file_name = 'data.json'
with open(f'/root/StableCascade/emollm2/EmoLLM/datasets/{file_name}', 'rt', encoding='utf-8') as file:
data = json.load(file)
n = 0
for i in data:
i['conversation'][0]['system'] = "你是心理健康助手EmoLLM由EmoLLM团队打造。你旨在通过专业心理咨询协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术一步步帮助来访者解决心理问题。"
try:
i['conversation'][0]['system'] = "你是心理健康助手EmoLLM由EmoLLM团队打造。你旨在通过专业心理咨询协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术一步步帮助来访者解决心理问题。"
except:
print(n,i) # 4 empty lines in data.json 425 483 742 1120
n+=1
with open('output2.json', 'wt', encoding='utf-8') as file:
with open(f'processed_{file_name}', 'wt', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=False, indent=4)
print(data[0])

View File

@ -0,0 +1,34 @@
import os
import json
# 设置目录路径这里假设你的JSON文件都在当前目录下的directory_path文件夹中
directory_path = './'
# 初始化一个空列表用于存储所有JSON文件的数据
combined_list = []
# 遍历指定目录下的所有文件
for filename in os.listdir(directory_path):
# 检查文件扩展名是否为.json
if filename.endswith('.json'):
# 构建文件的完整路径
file_path = os.path.join(directory_path, filename)
# 打开并读取JSON文件
with open(file_path, 'r', encoding='utf-8') as json_file:
# 加载JSON文件的内容
data = json.load(json_file)
# 将读取到的数据添加到combined_list中
# 假设每个JSON文件包含的是一个列表如果不是可以根据实际情况调整
if isinstance(data, list):
combined_list.extend(data)
else:
combined_list.append(data)
# 打印合并后的列表 very large and slow
# print(combined_list)
# 如果需要可以将合并后的列表保存到一个新的JSON文件中
with open('combined_data.json', 'w', encoding='utf-8') as combined_json_file:
json.dump(combined_list, combined_json_file, ensure_ascii=False, indent=4)

View File

@ -0,0 +1,27 @@
import json
# 打开JSON文件并读取其内容
# file_name = 'single_turn_dataset_1.json'
file_name = 'single_turn_dataset_2.json'
with open(f'/root/StableCascade/emollm2/EmoLLM/datasets/{file_name}', 'rt', encoding='utf-8') as file:
format1_data = json.load(file)
system = "你是心理健康助手EmoLLM由EmoLLM团队打造。你旨在通过专业心理咨询协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术一步步帮助来访者解决心理问题。"
# 转换为格式2的数据
format2_data = []
for item in format1_data:
conversation = {
"system": system,
"input": item["prompt"],
"output": item["completion"]
}
format2_data.append({"conversation": [conversation]})
# 将转换后的数据转换为JSON格式
with open(f'./processed_{file_name}', 'wt', encoding='utf-8') as file:
json.dump(format2_data, file, ensure_ascii=False, indent=4)
print(format2_data[0])

View File

@ -0,0 +1,203 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig)
from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import template_map_fn_factory
from xtuner.engine import DatasetInfoHook, EvaluateChatHook
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
#######################################################################
# PART 1 Settings #
#######################################################################
# Model
# pretrained_model_name_or_path = '/root/share/model_repos/internlm2-chat-7b'
pretrained_model_name_or_path = '/root/share/model_repos/internlm2-base-7b'
# Data
# data_path = 'merge.json'
data_path ='/root/StableCascade/emollm2/EmoLLM/datasets/processed/combined_data.json'
# https://github.com/InternLM/xtuner/blob/main/xtuner/utils/templates.py#L24C25-L24C25
prompt_template = PROMPT_TEMPLATE.internlm2_chat # there is No internlm2_base
max_length = 2048
pack_to_max_length = True
# Scheduler & Optimizer
# batch_size = 8 # per_device
# accumulative_counts = 2
batch_size = 16 # per_device
accumulative_counts = 1
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
warmup_ratio = 0.03
# Evaluate the generation performance during the training
evaluation_freq = 500
# SYSTEM = "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。"
SYSTEM = "你是心理健康助手EmoLLM由EmoLLM团队打造。你旨在通过专业心理咨询协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术一步步帮助来访者解决心理问题。"
evaluation_inputs = [
'我最近总是感到很焦虑,尤其是在学业上。我有个特别崇拜的同学,他好像在各方面都比我优秀,我总觉得自己怎么努力也追不上他,这让我压力特别大。', '我知道应该理性看待,但就是忍不住会去比较。我甚至晚上会因为这个睡不着觉,总想着怎样才能像他那样出色。'
]
#######################################################################
# PART 2 Model & Tokenizer #
#######################################################################
tokenizer = dict(
type=AutoTokenizer.from_pretrained,
pretrained_model_name_or_path=pretrained_model_name_or_path,
trust_remote_code=True,
padding_side='right')
model = dict(
type=SupervisedFinetune,
llm=dict(
type=AutoModelForCausalLM.from_pretrained,
pretrained_model_name_or_path=pretrained_model_name_or_path,
trust_remote_code=True,
torch_dtype=torch.float16,
quantization_config=dict(
type=BitsAndBytesConfig,
load_in_4bit=True,
load_in_8bit=False,
llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=False,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type='nf4')),
lora=dict(
type=LoraConfig,
r=64,
lora_alpha=16,
lora_dropout=0.1,
bias='none',
task_type='CAUSAL_LM'))
#######################################################################
# PART 3 Dataset & Dataloader #
#######################################################################
alpaca_en = dict(
type=process_hf_dataset,
dataset=dict(type=load_dataset, path='json', data_files=dict(train=data_path)),
tokenizer=tokenizer,
max_length=max_length,
dataset_map_fn=None,
template_map_fn=dict(
type=template_map_fn_factory, template=prompt_template),
remove_unused_columns=True,
shuffle_before_pack=True,
pack_to_max_length=pack_to_max_length)
train_dataloader = dict(
batch_size=batch_size,
num_workers=dataloader_num_workers,
dataset=alpaca_en,
sampler=dict(type=DefaultSampler, shuffle=True),
collate_fn=dict(type=default_collate_fn))
#######################################################################
# PART 4 Scheduler & Optimizer #
#######################################################################
# optimizer
optim_wrapper = dict(
type=AmpOptimWrapper,
optimizer=dict(
type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
accumulative_counts=accumulative_counts,
loss_scale='dynamic',
dtype='float16')
# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
param_scheduler = [
dict(
type=LinearLR,
start_factor=1e-5,
by_epoch=True,
begin=0,
end=warmup_ratio * max_epochs,
convert_to_iter_based=True),
dict(
type=CosineAnnealingLR,
eta_min=0.0,
by_epoch=True,
begin=warmup_ratio * max_epochs,
T_max=max_epochs,
convert_to_iter_based=True)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1)
#######################################################################
# PART 5 Runtime #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
dict(type=DatasetInfoHook, tokenizer=tokenizer),
dict(
type=EvaluateChatHook,
tokenizer=tokenizer,
every_n_iters=evaluation_freq,
evaluation_inputs=evaluation_inputs,
system=SYSTEM,
prompt_template=prompt_template)
]
# configure default hooks
default_hooks = dict(
# record the time of every iteration.
timer=dict(type=IterTimerHook),
# print log every 100 iterations.
logger=dict(type=LoggerHook, interval=10),
# enable the parameter scheduler.
param_scheduler=dict(type=ParamSchedulerHook),
# save checkpoint per epoch.
checkpoint=dict(type=CheckpointHook, interval=1),
# set sampler seed in distributed evrionment.
sampler_seed=dict(type=DistSamplerSeedHook),
)
# configure environment
env_cfg = dict(
# whether to enable cudnn benchmark
cudnn_benchmark=False,
# set multi process parameters
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
# set distributed parameters
dist_cfg=dict(backend='nccl'),
)
# set visualizer
visualizer = None
# set log level
log_level = 'INFO'
# load from which checkpoint
load_from = None
# whether to resume training from the loaded checkpoint
resume = False
# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)

View File

@ -1,11 +1,23 @@
datasets==2.16.1
deepspeed==0.13.1
einops==0.7.0
flash_attn==2.5.0
mmengine==0.10.2
openxlab==0.0.34
peft==0.7.1
sentencepiece==0.1.99
torch==2.1.2
transformers==4.36.2
xtuner==0.1.11
# modified version
# xtuner==0.1.11
# mmengine==0.10.2
mmengine==0.10.3
xtuner==0.1.15
# flash_attn==2.5.0 # build is very slow about 2 hours?
# method 1: https://github.com/Dao-AILab/flash-attention/releases
# flash_attn-2.5.0+cu122torch2.1cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
# method 2:
# pip install /root/share/wheels/flash_attn-2.4.2+cu118torch2.0cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
# mpi4py==3.1.5 # conda install mpi4py