From 950cab02625bc886dd37af3f03e7477d38a4906b Mon Sep 17 00:00:00 2001 From: HongCheng Date: Sat, 23 Mar 2024 15:24:45 +0900 Subject: [PATCH 1/8] optimize deduplicate.py Add time print information save duplicate dataset as well remove print(content) --- datasets/deduplicate.py | 49 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/datasets/deduplicate.py b/datasets/deduplicate.py index 54137cb..4ba7851 100644 --- a/datasets/deduplicate.py +++ b/datasets/deduplicate.py @@ -5,6 +5,9 @@ from datasketch import MinHash from hashlib import md5 from simhash import Simhash +import time +import numpy as np + def extract_text_from_json(obj, content): # print(content) if isinstance(obj, dict): @@ -29,7 +32,7 @@ def is_duplicate_absolutely(d1, d2): def hash_dict(dict_obj): content = extract_text_from_json(dict_obj,'') content = content.replace('\n', '').replace('\t', '').replace(' ', '') - print(content) + # print(content) # m = get_minhash(content) m = Simhash(content) return m @@ -43,10 +46,19 @@ def get_simhash(dict_obj): return Simhash(dict_obj) # 使用绝对匹配和MinHash对dict列表去重 -def deduplicate_json(data_list, threshold=0.8): +def deduplicate_json(data_list, threshold=0.8, time_print=True): seen_hashes = [] keep = [] duplicate = [] + + # global start + start = time.time() + last_start_seen_hashes = start + last_start_duplicate = start + stop1 = 0 + stop2 = 0 + print_interval = 500 + for item in data_list: if not item['conversation']: continue @@ -60,15 +72,36 @@ def deduplicate_json(data_list, threshold=0.8): has_similar = False # for stored_min_hash, stored_text in seen_hashes: # if stored_min_hash.jaccard(min_hash) > threshold: + for stored_min_hash, stored_text in seen_hashes: if 1 - (stored_min_hash.distance(sim_hash)/64.0) > threshold: has_similar = True duplicate.append(item) + + print_len_duplicate = len(duplicate)+1 + if print_len_duplicate%print_interval == 0: + if time_print: + stop1 = time.time() + 
print(f'print_len_duplicate={print_len_duplicate} Time: ', np.round(stop1 - last_start_duplicate, 5), np.round(stop1 - start , 5)) + last_start_duplicate = stop1 + else: + print(f'print_len_duplicate={print_len_duplicate}') + break if not has_similar: - # seen_hashes.append((min_hash,item)) + seen_hashes.append((sim_hash,item)) keep.append(item) + + + print_len_seen_hashes = len(seen_hashes)+1 + if print_len_seen_hashes%print_interval == 0: + if time_print: + stop2 = time.time() + print(f'print_len_seen_hashes={print_len_seen_hashes} Time: ', str(np.round(stop2 - last_start_seen_hashes,5)), str(np.round(stop2 - start, 5))) + last_start_seen_hashes = stop2 + else: + print(f'print_len_seen_hashes={print_len_seen_hashes}') else: duplicate.append(item) @@ -77,7 +110,8 @@ def deduplicate_json(data_list, threshold=0.8): if __name__ == '__main__': DUP_THRESH = 0.8 - data_ai = 'qwen' + data_ai = 'FatherLikeBF' + # root_dir = rf'./datasets/{data_ai}/' root_dir = rf'./{data_ai}/' dedup_output_dir = os.path.join(root_dir,'dedup') if not os.path.exists(dedup_output_dir): @@ -93,9 +127,14 @@ if __name__ == '__main__': if is_json_file(file_path): with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) - dedup_data, duplicate = deduplicate_json(data, DUP_THRESH) + dedup_data, duplicate = deduplicate_json(data, DUP_THRESH) + with open(os.path.join(root_dir, 'dedup','dedup_' + file), 'w', encoding='utf-8') as output_file: json.dump(dedup_data, output_file, ensure_ascii=False, indent=4) + + with open(os.path.join(root_dir, 'dedup','dup_' + file), 'w', encoding='utf-8') as output_file: + json.dump(duplicate, output_file, ensure_ascii=False, indent=4) + for item in dedup_data: logger.info(f'dedup_data: {item}') for item in duplicate: From 252adc7eefbb5a661ad8c877beffd6643cebdf06 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Sat, 23 Mar 2024 15:25:37 +0900 Subject: [PATCH 2/8] add base model qlora fintuning config file: internlm2_7b_base_qlora_e10_M_1e4_32_64.py --- 
...internlm2_7b_base_qlora_e10_M_1e4_32_64.py | 210 ++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 xtuner_config/internlm2_7b_base_qlora_e10_M_1e4_32_64.py diff --git a/xtuner_config/internlm2_7b_base_qlora_e10_M_1e4_32_64.py b/xtuner_config/internlm2_7b_base_qlora_e10_M_1e4_32_64.py new file mode 100644 index 0000000..35c3519 --- /dev/null +++ b/xtuner_config/internlm2_7b_base_qlora_e10_M_1e4_32_64.py @@ -0,0 +1,210 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +# pretrained_model_name_or_path = '/root/share/model_repos/internlm2-chat-7b' +pretrained_model_name_or_path = '/root/share/model_repos/internlm2-base-7b' + +# Data +# data_path = 'merge.json' +data_path ='/root/StableCascade/emollm2/EmoLLM/datasets/processed/combined_data.json' + +# https://github.com/InternLM/xtuner/blob/main/xtuner/utils/templates.py#L24C25-L24C25 +prompt_template = PROMPT_TEMPLATE.internlm2_chat # there is No internlm2_base + +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer + +# batch_size = 8 # per_device +# 
accumulative_counts = 2 +batch_size = 16 # per_device +accumulative_counts = 1 + +dataloader_num_workers = 0 +max_epochs = 10 +optim_type = AdamW +lr = 1e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Evaluate the generation performance during the training +evaluation_freq = 500 +# SYSTEM = "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。" +SYSTEM = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" +evaluation_inputs = [ + '我最近总是感到很焦虑,尤其是在学业上。我有个特别崇拜的同学,他好像在各方面都比我优秀,我总觉得自己怎么努力也追不上他,这让我压力特别大。', + '我知道应该理性看待,但就是忍不住会去比较。我甚至晚上会因为这个睡不着觉,总想着怎样才能像他那样出色。', + # ['我最近总是感到很焦虑,尤其是在学业上。我有个特别崇拜的同学,他好像在各方面都比我优秀,我总觉得自己怎么努力也追不上他,这让我压力特别大。', + # '我知道应该理性看待,但就是忍不住会去比较。我甚至晚上会因为这个睡不着觉,总想着怎样才能像他那样出色。'] +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + # r=64, + # lora_alpha=16, + r=32, + lora_alpha=64, + # r=16, + # lora_alpha=32, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + 
dataset=dict(type=load_dataset, path='json', data_files=dict(train=data_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=None, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + T_max=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + 
evaluation_inputs=evaluation_inputs, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) From df81a99f5313852f42aa2f5ed6a10cbaf7bcfa69 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Sat, 23 Mar 2024 15:26:01 +0900 Subject: [PATCH 3/8] add full finetune code from internlm2 --- ...chat_7b_full_finetune_custom_dataset_e1.py | 222 ++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 xtuner_config/internlm2_chat_7b_full_finetune_custom_dataset_e1.py diff --git a/xtuner_config/internlm2_chat_7b_full_finetune_custom_dataset_e1.py b/xtuner_config/internlm2_chat_7b_full_finetune_custom_dataset_e1.py new file mode 100644 index 0000000..7e3336e --- /dev/null +++ b/xtuner_config/internlm2_chat_7b_full_finetune_custom_dataset_e1.py @@ -0,0 +1,222 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+"""Data format: +[ + { + "conversation": [ + { + "system": "", + "input": "xxx", + "output": "xxx" + }, + { + "input": "xxx", + "output": "xxx" + } + ] + }, +... +] +Please refer to https://github.com/InternLM/xtuner/blob/main/docs/en/user_guides/dataset_format.md for details. +""" # noqa: E501 +from datasets import load_dataset +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from torch.optim import AdamW +from torch.utils.data import BatchSampler +from transformers import AutoModelForCausalLM, AutoTokenizer + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import template_map_fn_factory +from xtuner.dataset.samplers import InternRepoSampler +from xtuner.engine import (DatasetInfoHook, EvaluateChatHook, ThroughputHook, + VarlenAttnArgsToMessageHubHook) +from xtuner.engine.runner import TrainLoop +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'internlm/internlm2-chat-7b' +use_varlen_attn = True + +# Data +data_files = ['/path/to/json/file.json'] +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 32768 +pack_to_max_length = True + +# Scheduler & Optimizer +# batch size per device, set to 1 if `use_varlen_attn` = True +# To clarify, enlarging the batch size essentially enlarges the `max_length`. 
+# For example, doubling the max length is tantamount to doubling the batch size +batch_size = 1 +accumulative_counts = 1 # 1bs * 1acc * 64gpu = 64 batchsize +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 4e-5 +betas = (0.9, 0.95) +weight_decay = 0.01 +max_norm = 1 # grad clip +warm_up_ratio = 0.025 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + use_varlen_attn=use_varlen_attn, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + use_varlen_attn=use_varlen_attn, + dataset=dict(type=load_dataset, path='json', data_files=data_files), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=None, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=InternRepoSampler, shuffle=True, seed=1024), + batch_sampler=dict( + 
type=BatchSampler, drop_last=True, batch_size=batch_size), + collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', +) + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1 / 40, + by_epoch=True, + begin=0, + end=warm_up_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=lr * 0.15, + by_epoch=True, + begin=warm_up_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict( + type=DatasetInfoHook, tokenizer=tokenizer, + is_intern_repo_dataset=True), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + system=SYSTEM, + prompt_template=prompt_template), + dict(type=ThroughputHook) +] + +if use_varlen_attn: + custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. 
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=1), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +log_processor = dict( + by_epoch=False, + window_size=1, + mean_pattern=r'.*(loss|time|data_time|grad_norm|tflops).*') From 01240019262cbaeaaca8e4106ebaf2a5e654a182 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Sat, 23 Mar 2024 15:26:46 +0900 Subject: [PATCH 4/8] other 2 configs for base model --- .../internlm2_7b_base_qlora_e10_b8_16_32.py | 205 ++++++++++++++++++ .../internlm2_7b_base_qlora_e3_M_1e4_32_64.py | 203 +++++++++++++++++ 2 files changed, 408 insertions(+) create mode 100644 xtuner_config/internlm2_7b_base_qlora_e10_b8_16_32.py create mode 100644 xtuner_config/internlm2_7b_base_qlora_e3_M_1e4_32_64.py diff --git a/xtuner_config/internlm2_7b_base_qlora_e10_b8_16_32.py b/xtuner_config/internlm2_7b_base_qlora_e10_b8_16_32.py new file mode 100644 index 0000000..b8824d9 --- /dev/null +++ b/xtuner_config/internlm2_7b_base_qlora_e10_b8_16_32.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +# pretrained_model_name_or_path = '/root/share/model_repos/internlm2-chat-7b' +pretrained_model_name_or_path = '/root/share/model_repos/internlm2-base-7b' + +# Data +# data_path = 'merge.json' +data_path ='/root/StableCascade/emollm2/EmoLLM/datasets/processed/combined_data.json' + +# https://github.com/InternLM/xtuner/blob/main/xtuner/utils/templates.py#L24C25-L24C25 +prompt_template = PROMPT_TEMPLATE.internlm2_chat # there is No internlm2_base + +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer + +# batch_size = 8 # per_device +# accumulative_counts = 2 +batch_size = 8 # per_device +accumulative_counts = 1 + +dataloader_num_workers = 0 +max_epochs = 10 +optim_type = AdamW +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Evaluate the generation performance during the training +evaluation_freq = 500 +# SYSTEM = "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。" +SYSTEM = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" +evaluation_inputs = [ + 
'我最近总是感到很焦虑,尤其是在学业上。我有个特别崇拜的同学,他好像在各方面都比我优秀,我总觉得自己怎么努力也追不上他,这让我压力特别大。', '我知道应该理性看待,但就是忍不住会去比较。我甚至晚上会因为这个睡不着觉,总想着怎样才能像他那样出色。' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + # r=64, + # lora_alpha=16, + r=16, + lora_alpha=32, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path='json', data_files=dict(train=data_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=None, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # 
+####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + T_max=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner_config/internlm2_7b_base_qlora_e3_M_1e4_32_64.py b/xtuner_config/internlm2_7b_base_qlora_e3_M_1e4_32_64.py new file mode 100644 index 0000000..f46a55e --- /dev/null +++ b/xtuner_config/internlm2_7b_base_qlora_e3_M_1e4_32_64.py @@ -0,0 +1,203 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +# pretrained_model_name_or_path = '/root/share/model_repos/internlm2-chat-7b' +pretrained_model_name_or_path = 
'/root/share/model_repos/internlm2-base-7b' + +# Data +# data_path = 'merge.json' +data_path ='/root/StableCascade/emollm2/EmoLLM/datasets/processed/combined_data.json' + +# https://github.com/InternLM/xtuner/blob/main/xtuner/utils/templates.py#L24C25-L24C25 +prompt_template = PROMPT_TEMPLATE.internlm2_chat # there is No internlm2_base + +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer + +# batch_size = 8 # per_device +# accumulative_counts = 2 +batch_size = 16 # per_device +accumulative_counts = 1 + +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = AdamW +lr = 1e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Evaluate the generation performance during the training +evaluation_freq = 500 +# SYSTEM = "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。" +SYSTEM = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" +evaluation_inputs = [ + '我最近总是感到很焦虑,尤其是在学业上。我有个特别崇拜的同学,他好像在各方面都比我优秀,我总觉得自己怎么努力也追不上他,这让我压力特别大。', '我知道应该理性看待,但就是忍不住会去比较。我甚至晚上会因为这个睡不着觉,总想着怎样才能像他那样出色。' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=32, + lora_alpha=64, + lora_dropout=0.1, + bias='none', + 
task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path='json', data_files=dict(train=data_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=None, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + T_max=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log 
the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) From a22ec59be5aa5260fdc2b710bd827410acd8eb50 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Sat, 23 Mar 2024 15:43:01 +0900 Subject: [PATCH 5/8] update cli_internlm2.py three methods to load model 1. download model in openxlab 2. download model in modelscope 3. 
offline model --- demo/cli_internlm2.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/demo/cli_internlm2.py b/demo/cli_internlm2.py index 0a5ba2e..9f14739 100644 --- a/demo/cli_internlm2.py +++ b/demo/cli_internlm2.py @@ -1,17 +1,23 @@ import torch from transformers import AutoTokenizer, AutoModelForCausalLM from openxlab.model import download +from modelscope import snapshot_download -download(model_repo='jujimeizuo/EmoLLM_Model', - output='model') +# download model in openxlab +model_name_or_path =download(model_repo='ajupyter/EmoLLM_internlm2_7b_full', + output='EmoLLM_internlm2_7b_full') -model_name_or_path = "model" +# download model in modelscope +model_name_or_path = snapshot_download('chg0901/EmoLLM-InternLM7B-base') + +# offline model +# model_name_or_path = "/root/StableCascade/emollm2/EmoLLM/xtuner_config/merged" tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map='auto') model = model.eval() -system_prompt = "你是一个由aJupyter、Farewell、jujimeizuo、Smiling&Weeping研发(排名按字母顺序排序,不分先后)、散步提供技术支持、上海人工智能实验室提供支持开发的心理健康大模型。现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。" +system_prompt = '你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。' messages = [(system_prompt, '')] From affd90b177626079e63d8d1acb3f86c42117dfd3 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Sat, 23 Mar 2024 15:45:11 +0900 Subject: [PATCH 6/8] create upload_modelscope.py --- xtuner_config/upload_modelscope.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 xtuner_config/upload_modelscope.py diff --git a/xtuner_config/upload_modelscope.py b/xtuner_config/upload_modelscope.py new file mode 100644 index 0000000..a7b43fd --- /dev/null +++ b/xtuner_config/upload_modelscope.py @@ -0,0 +1,10 @@ +from modelscope.hub.api import HubApi + +YOUR_ACCESS_TOKEN = 
'请从ModelScope个人中心->访问令牌获取' + +api = HubApi() +api.login(YOUR_ACCESS_TOKEN) +api.push_model( + model_id="yourname/your_model_id", + model_dir="my_model_dir" # 本地模型目录,要求目录中必须包含configuration.json +) \ No newline at end of file From 6e0042a54deec495c8992d2c304b6ff8e5fc28d7 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Sat, 23 Mar 2024 16:05:05 +0900 Subject: [PATCH 7/8] add base model and update personal contributions --- README.md | 13 +++++++------ README_EN.md | 12 +++++------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 9bda0a9..5416a9b 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,6 @@ 提出新特性 - **EmoLLM** 是一系列能够支持 **理解用户-支持用户-帮助用户** 心理健康辅导链路的心理健康大模型,由 `LLM`指令微调而来,欢迎大家star~⭐⭐。目前已经开源的 `LLM` 微调配置如下: @@ -49,6 +48,7 @@ | :-------------------: | :--------: | | InternLM2_7B_chat | QLORA | | InternLM2_7B_chat | 全量微调 | +| InternLM2_7B_base | QLORA | | InternLM2_1_8B_chat | 全量微调 | | InternLM2_20B_chat | LORA | | Qwen_7b_chat | QLORA | @@ -110,13 +110,14 @@ ### 🏆荣誉栏 + - 项目荣获上海人工智能实验室举办的**2024浦源大模型系列挑战赛春季赛*****50强***

浦语挑战赛TOP50

- + - 项目荣获公众号**NLP工程化**[推文宣传](https://mp.weixin.qq.com/s/78lrRl2tlXEKUfElnkVx4A) ### 🎯路线图 @@ -151,9 +152,10 @@ - [如何参与本项目](#如何参与本项目) - [作者(排名不分先后)](#作者排名不分先后) - [版权说明](#版权说明) + - [引用](#引用) - [特别鸣谢](#特别鸣谢) - [Star History](#star-history) - - [🌟Contributors](#-contributors) + - [🌟 Contributors](#-contributors) - [交流群](#交流群) ###### 开发前的配置要求 @@ -234,7 +236,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git | [ZeyuBa](https://github.com/ZeyuBa) | 自动化所在读硕士 | | | | [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | 宾夕法尼亚大学在读硕士 | | | | [Nobody-ML](https://github.com/Nobody-ML) | 中国石油大学(华东)在读本科生 | | | -| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora/) |MiniSora主要维护| 数据清洗、文档翻译 | +| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora/) |[MiniSora](https://github.com/mini-sora/minisora/)主要维护者,管理员| LLM微调、数据清洗、文档翻译 | | [Mxoder](https://github.com/Mxoder) | 北京航空航天大学在读本科生 | | | | [Anooyman](https://github.com/Anooyman) | 南京理工大学硕士 | | | | [Vicky-3021](https://github.com/Vicky-3021) | 西安电子科技大学硕士(研0) | | | @@ -248,8 +250,8 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git 该项目签署了 MIT 授权许可,详情请参阅 [LICENSE](https://github.com/SmartFlowAI/EmoLLM/blob/main/LICENSE) - ### 引用 + 如果本项目对您的工作有所帮助,请使用以下格式引用: ```bibtex @@ -300,7 +302,6 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git [OpenXLab_App-url]: https://openxlab.org.cn/apps/detail/Farewell1/EmoLLMV2.0 [OpenXLab_Model-url]: https://openxlab.org.cn/models/detail/ajupyter/EmoLLM_internlm2_7b_full - ## 交流群 - 如果失效,请移步Issue区 diff --git a/README_EN.md b/README_EN.md index 1199c53..eaa0b03 100644 --- a/README_EN.md +++ b/README_EN.md @@ -25,7 +25,7 @@

EmoLLM

- 简体中文 | English + 简体中文 | English

Explore the documentation of this project » @@ -42,7 +42,6 @@ - **EmoLLM** is a series of large language models designed to understand, support and help customers in mental health counseling. It is fine-tuned from the LLM instructions. We really appreciate it if you could give it a star~⭐⭐. The open-sourced configuration is as follows:

@@ -51,6 +50,7 @@ | :-------------------: | :------: | | InternLM2_7B_chat | QLORA | | InternLM2_7B_chat | full fine-tuning | +| InternLM2_7B_base | QLORA | | InternLM2_1_8B_chat | full fine-tuning | | InternLM2_20B_chat | LORA | | Qwen_7b_chat | QLORA | @@ -90,7 +90,6 @@ The Model aims to fully understand and promote the mental health of individuals, - 【2024.2.18】 The full fine-tuned version based on Qwen1_5-0_5B-Chat has been [open-sourced](https://www.modelscope.cn/models/aJupyter/EmoLLM_Qwen1_5-0_5B-Chat_full_sft/summary). Friends with limited computational resources can now dive in and explore it. -
View More @@ -173,8 +172,6 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git - [Deployment Guide](#deployment-guide) - View More Details - - ### File Directory Explanation ``` @@ -203,8 +200,8 @@ For details, see the [fine-tuning guide](xtuner_config/README.md) - Demo deployment: see [deployment guide](./demo/README.md) for details. - Quantitative deployment based on [LMDeploy](https://github.com/InternLM/lmdeploy/): see [deploy](./deploy/lmdeploy.md) - ### RAG (Retrieval Augmented Generation) Pipeline + - See [RAG](./rag/)
@@ -251,7 +248,7 @@ This project uses Git for version control. You can see the currently available v | [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | | [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | | [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | -| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin| Data Cleaning and Docs Translation | +| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin of [MiniSora](https://github.com/mini-sora/minisora) | LLM Fine-Tuning, Data Cleaning and Docs Translation | | [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | | [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | | [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | @@ -308,6 +305,7 @@ The project is licensed under the MIT License. Please refer to the details [OpenXLab_Model-url]: https://openxlab.org.cn/models/detail/ajupyter/EmoLLM_internlm2_7b_full ## Communication group + - If it fails, go to the Issue section.

From 383789e8697cf5cca741078d9ce18a25e2102a73 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Sat, 23 Mar 2024 19:52:52 +0900 Subject: [PATCH 8/8] Create README_internlm2_7b_base_qlora.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit InternLM2 7B Base QLoRA 微调指南 --- .../README_internlm2_7b_base_qlora.md | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 xtuner_config/README_internlm2_7b_base_qlora.md diff --git a/xtuner_config/README_internlm2_7b_base_qlora.md b/xtuner_config/README_internlm2_7b_base_qlora.md new file mode 100644 index 0000000..e0ca4b6 --- /dev/null +++ b/xtuner_config/README_internlm2_7b_base_qlora.md @@ -0,0 +1,192 @@ +# InternLM2 7B Base QLoRA 微调指南 + +## 模型基座与配置文件 + +- 本项目在[**internlm2_7b_chat_qlora_e3**模型](./internlm2_7b_chat_qlora_e3.py)微调[指南](./README.md)的基础上,更新了对[**internlm2_7b_base_qlora_e10_M_1e4_32_64(配置文件)**](./internlm2_7b_base_qlora_e10_M_1e4_32_64.py)**模型**的微调。 + +## 数据 + +### 数据集 + +只采用了通用的数据集,不包括带有Role-Play色彩的数据集,详情请查看[数据集](../datasets/README.md)页面 + +| Category | Dataset | Type | Total | +| :---------: | :-------------------: | :----------: | :-----: | +| General | data | Conversation | 5600+ | +| General | data_pro | Conversation | 36500+ | +| General | multi_turn_dataset_1 | Conversation | 36,000+ | +| General | multi_turn_dataset_2 | Conversation | 27,000+ | +| General | single_turn_dataset_1 | QA | 14000+ | +| General | single_turn_dataset_2 | QA | 18300+ | + +### 数据集处理 + +#### 数据格式 + +训练用的数据,被处理成单轮或者多轮对话的形式,如 + +```python +[ + # 多轮对话 + { + "conversation": [ + { + "input": "我最近很迷茫,高三后的压力让我感到恐惧,我对自己的存在感到怀疑,您能不能给我一些建议呢?", + "output": "看到你面临的困境与压力,真心希望能够帮助你。首先,我想说的是,你并不是孤独的,高考前的困惑与迷茫是很常见的。不要给自己太大的压力,给自己短暂的放松。你认识到问题所在,这已经是一个很大的进步了。", + "system": "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" + }, + { + "input": "谢谢您的鼓励,但我觉得自己学习成绩不好,没有进步,这让我更加困惑。", + "output": 
"学习成绩好坏不是解决问题的关键,专注于自己的进步更为重要。为了达到自己的目标,制定一份合理的计划,按部就班的完成每一项任务。有时候自学不如求助他人,不要害怕问老师和同学问题,以便获得更好的帮助和理解。" + }, + { + "input": "我认为我缺乏自信,不知道怎样扭转自己的消极情绪,您能给我一些建议吗?", + "output": "建立自信需要时间和努力。让我们从一点一滴的进步中建立自信心。设立一些具体的目标,完成这些目标将会培养自己的自信。鼓励自己,在每天结束前,总结一下自己的成就,这将帮助你更好的看到自身的优点。" + }, + { + "input": "我感到很无助,不知道该如何调整自己的心态。", + "output": "了解自己的情绪并寻找有效的调节方法是处理感到无助情况的关键。比如爱好运动、听音乐和阅读书籍等,这些活动对于您的心理健康都是有积极帮助的。此外,在处理感到无助的情况下,家庭和朋友的支持也是非常关键的。" + }, + { + "input": "我感觉我的思维和学习方法都很糟糕,该怎样改进呢?", + "output": "学习方法确实很重要,它会对你的成绩和情绪有很大的影响。最有效的学习方法是利用时间来做练习,多做一些题目,练习自己的掌握能力。建立起对知识的兴趣,这需要不断的积累,不要害怕失败或挫折,它们会成为助你成长的宝贵财富。" + }, + { + "input": "我会按照您的建议努力,但想问一下,我该如何面对可能的失败?", + "output": "每个人都会面对失败和挫折,不要让它们打败你。碰到失败时,重新审视自己,分析分析自己的不足,寻找方法去改进。记得勇敢地看向未来,不要让过去的失败阻碍了自己前进的脚步。" + }, + { + "input": "非常感谢您的建议和支持,我会努力去实践。", + "output": "这是我应该做的,你有勇气面对自己困难,你并不孤独,我们会一起为你打气,祝愿你早日获得心灵的平静和所有的成功。" + } + ] + }, + # 单轮对话 + { + "conversation": [ + { + "system": "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。", + "input": "一想到事情就紧张……?\n我最近因为考试,觉得心情很不好,自己没有及格,而且,明明觉得自己比室友付出的多,努力的多,平时做题也是一样的,可是不知道为什么只有我不及格,感觉很苦恼。而且我只要一有什么事情就十分紧张,考试也是,心跳加速,手心出汗,而且大脑一片空白,有时甚至会头疼,高考以及现在的考试都是这样。而且我觉得一情绪激动的时候就会咳嗽,想哭哭不出来,感觉很难受,头也会疼。就是觉得心情不好,但是有不敢表现出来。我该怎么办呢?", + "output": "为什么不敢表现出来呢?试试表现出来,会怎样?\n有时候我们越抗拒,就会越强化。当我们接受了,反而症状就消失了。\n那我们要怎样接受呢?\n这本书《心灵的七种兵器》推荐看看哦,内容精简,又有真实事例可以帮助理解。\n我们为什么改不掉这个所谓的坏习惯。\n" + } + ] + }, +``` + +### 数据处理 + + - 使用 `../datasets/process.py` 以处理 **multi_turn_dataset(1 和 2,QA数据转单轮对话)**, `data.json` 和 `data_pro.json` 文件(两个多轮对话),以添加或者调整 **`system` prompt** + - 使用 `../datasets/processed/process_single_turn_conversation_construction.py` 处理 **single-turn dataset** (1 和 2),修改 (`input` 和 `output`) ,并在每次 **conversation** 中添加 **`system` prompt** + - 使用 `../datasets/processed/process_merge.py` 用于合并 `../datasets/processed/` 目录下**6个更新后的数据集**,生成一个合并后的数据集 `combined_data.json`用于最终训练 + +### 数据量与训练epochs设置 + +- 由于采用了更大的数据集,我们对模型进行了**10 
epoch**的训练,读者可以根据训练过程中的输出和loss变化,进行训练的终止和模型的挑选,也可以采用更加专业的评估方法,来对模型评测。 
+- 在我们公布的托管于OpenXLab微调后的 internlm2_7b_base_qlora微调模型中,我们保留了两个版本,一个是[5 epoch模型](https://openxlab.org.cn/models/detail/chg0901/EmoLLM-InternLM7B-base/tree/main),另一个是[10 epoch模型](https://openxlab.org.cn/models/detail/chg0901/EmoLLM-InternLM7B-base-10e/tree/main)版本(**ModelScope**模型:[5 epoch模型](https://www.modelscope.cn/models/chg0901/EmoLLM-InternLM7B-base/files)和[10 epoch模型](https://www.modelscope.cn/models/chg0901/EmoLLM-InternLM7B-base-10e/files))。 + +## 基于XTuner的微调🎉🎉🎉🎉🎉 + +### 环境准备 + +```markdown +datasets==2.16.1 +deepspeed==0.13.1 +einops==0.7.0 +flash_attn==2.5.0 +openxlab==0.0.34 +peft==0.7.1 +sentencepiece==0.1.99 +torch==2.1.2 +transformers==4.36.2 +mmengine==0.10.3 +xtuner==0.1.15 +flash_attn==2.5.0 +``` + +也可以一键安装 + +```bash +cd xtuner_config/ +pip install -r requirements.txt +``` + +温馨提示:flash_attn的安装可能需要在本地编译,大约需要一到两小时,可以去[flash-attention](https://github.com/Dao-AILab/flash-attention/releases)中,查找和自己机器配置匹配的whl安装包或者采用InternLM AI studio提供的2.4.2版本whl安装包,自行安装,如: + +```bash +# from flash-attention +pip install flash_attn-2.5.0+cu122torch2.1cxx11abiTRUE-cp310-cp310-linux_x86_64.whl + +# from InternLM AI studio share folder +pip install /root/share/wheels/flash_attn-2.4.2+cu118torch2.0cxx11abiTRUE-cp310-cp310-linux_x86_64.whl +``` + +--- + +### 微调 + +```bash +cd xtuner_config/ +xtuner train internlm2_7b_base_qlora_e10_M_1e4_32_64.py --deepspeed deepspeed_zero2 +``` + +--- + +### 将得到的 PTH 模型转换为 HuggingFace 模型 + +**即:生成 Adapter 文件夹** + +```bash +cd xtuner_config/ +mkdir hf +export MKL_SERVICE_FORCE_INTEL=1 + +xtuner convert pth_to_hf internlm2_7b_base_qlora_e10_M_1e4_32_64.py ./work_dirs/internlm2_7b_base_qlora_e10_M_1e4_32_64/epoch_5.pth ./hf +``` + +--- + +### 将 HuggingFace adapter 合并到大语言模型 + +```bash +xtuner convert merge /root/share/model_repos/internlm2-base-7b ./hf ./merged --max-shard-size 2GB +# xtuner convert merge \ +# ${NAME_OR_PATH_TO_LLM} \ +# ${NAME_OR_PATH_TO_ADAPTER} \ +# ${SAVE_PATH} \ +# --max-shard-size 2GB +``` + +### 10 epoch 模型的处理 + +```bash 
+ +cd xtuner_config/ +mkdir hf10 +export MKL_SERVICE_FORCE_INTEL=1 + +xtuner convert pth_to_hf internlm2_7b_base_qlora_e10_M_1e4_32_64.py ./work_dirs/internlm2_7b_base_qlora_e10_M_1e4_32_64/epoch_10.pth ./hf10 + +xtuner convert merge /root/share/model_repos/internlm2-base-7b ./hf10 ./merged10 --max-shard-size 2GB +# xtuner convert merge \ +# ${NAME_OR_PATH_TO_LLM} \ +# ${NAME_OR_PATH_TO_ADAPTER} \ +# ${SAVE_PATH} \ +# --max-shard-size 2GB +``` + +--- + +### 测试 + +```bash +cd demo/ +python cli_internlm2.py +``` + +--- + +## 其他 + +欢迎大家给[xtuner](https://github.com/InternLM/xtuner)和[EmoLLM](https://github.com/aJupyter/EmoLLM)点点star~ + +🎉🎉🎉🎉🎉