From 92f1272ed3777f32700be623b758213af7586d59 Mon Sep 17 00:00:00 2001 From: jujimeizuo Date: Fri, 19 Jan 2024 15:49:10 +0800 Subject: [PATCH 1/3] chore: update script --- .gitignore | 3 ++- README.md | 5 +++++ data/zhipuai_gen_data.py | 40 ++++++++++++++++++++-------------------- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 53e7220..de34f61 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ ESConv.json .DS_Store __pycache__/ -tmp/ \ No newline at end of file +tmp/ +data/zhipuai/ \ No newline at end of file diff --git a/README.md b/README.md index e0811a4..ab1f6aa 100644 --- a/README.md +++ b/README.md @@ -1 +1,6 @@ # EmoLLM + + +## 🌟 Contributors + +[![EmoLLM contributors](https://contrib.rocks/image?repo=aJupyter/EmoLLM&max=2000)](https://github.com/aJupyter/EmoLLM/graphs/contributors) \ No newline at end of file diff --git a/data/zhipuai_gen_data.py b/data/zhipuai_gen_data.py index 2d95b08..d8287a4 100644 --- a/data/zhipuai_gen_data.py +++ b/data/zhipuai_gen_data.py @@ -1,4 +1,5 @@ import os +import random import json from tqdm import tqdm from dotenv import load_dotenv @@ -22,10 +23,12 @@ def zhipu_api(data, emo): 医生:医生的安抚和建议 ''' + top_p = round(random.uniform(0.1, 0.9), 2) messages = getText('user', prompt) response = client.chat.completions.create( model='glm-4', messages=messages, + top_p=top_p, ) return response.choices[0].message.content @@ -47,6 +50,8 @@ def convert(conversation): def save_jsonl(data_lis, file_path): + if not os.path.exists(os.path.dirname(file_path)): + os.makedirs(os.path.dirname(file_path)) with open(file_path, 'w', encoding='utf-8') as f: for item in data_lis: f.write(json.dumps(item, ensure_ascii=False) + '\n') @@ -67,7 +72,7 @@ if __name__ == '__main__': "渴望", "厌恶", "同情", - "痛苦" + "痛苦", "着迷", "嫉妒", "兴奋", @@ -80,7 +85,6 @@ if __name__ == '__main__': "悲伤", "满意", "性欲", - "同情", "满足" ] areas_of_life = [ @@ -103,22 +107,18 @@ if __name__ == '__main__': ] conversation_lis = [] - idx = 
0 - for area in areas_of_life: - j = 0 - for idx in tqdm(range(len(emotions_lis)), desc=f'data:{area}, emo:{emotions_lis[j]}'): - emo = emotions_lis[j] - res = zhipu_api(area, emo) - print(res) - if res == 'null': - print(area, emo, 'error') + for emo in emotions_lis: + for area in areas_of_life: + if os.path.exists(f'./zhipuai/{area}/{emo}.jsonl'): + print(f'./zhipuai/{area}/{emo}.jsonl exists') continue - conversation_lis.append(convert(res)) - if idx % 2 == 1: - save_jsonl(conversation_lis, f'./zhipuai_{idx}.jsonl') - conversation_lis = [] - idx += 1 - j += 1 - if len(conversation_lis) > 0: - save_jsonl(conversation_lis, f'./zhipuai.jsonl') - conversation_lis = [] \ No newline at end of file + for i in tqdm(range(5), desc='{emo}, {area}'.format(emo=emo, area=area)): + res = zhipu_api(area, emo) + print(res) + if res == 'null': + print(area, emo, 'error') + continue + conversation_lis.append(convert(res)) + save_jsonl(conversation_lis, f'./zhipuai/{area}/{emo}.jsonl') + print(f'generate ./zhipuai/{area}/{emo}.jsonl') + conversation_lis = [] From 48c67f7299f6468e8f0c0530b21476e6b0bb47cc Mon Sep 17 00:00:00 2001 From: jujimeizuo Date: Fri, 19 Jan 2024 15:52:53 +0800 Subject: [PATCH 2/3] update: merge_json.py --- data/merge_json.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/data/merge_json.py b/data/merge_json.py index 6b7d0c7..0171df9 100644 --- a/data/merge_json.py +++ b/data/merge_json.py @@ -3,38 +3,38 @@ import os def save_merge_json(data_lis, file_path): - import json - with open(file_path, 'wt', encoding='utf-8') as file: - json.dump(data_lis, file, ensure_ascii=False) + json.dump(data_lis, file, indent=4, ensure_ascii=False) def get_all_file_paths(folder_path): - # 确保传入的是一个目录 - if not os.path.isdir(folder_path): - raise ValueError(f"{folder_path} is not a valid directory") - - # 获取文件夹下所有文件的路径 - file_paths = [os.path.join(folder_path, file) for file in os.listdir( - folder_path) if 
os.path.isfile(os.path.join(folder_path, file))] - return file_paths + files = os.listdir(folder_path) + path = [] + for file in files: + file_path = os.path.join(folder_path, file) + if os.path.isdir(file_path): + path.extend(get_all_file_paths(file_path)) + else: + path.append(file_path) + return path if __name__ == '__main__': conversion_lis = [] + folder_path = '' # input + merge_path = '' # input + paths = get_all_file_paths(folder_path=folder_path) - for path in get_all_file_paths('res/'): + for path in paths: print(path) - - with open(path, 'rt', encoding='utf-8') as file: - for line in file: + with open(path, 'rt', encoding='utf-8') as lines: + for line in lines: # 移除行尾的换行符 - line = line.rstrip('\n') + line.rstrip('\n') # 解析JSON try: data = json.loads(line) conversion_lis.append(data) except json.JSONDecodeError as e: print(f"Error decoding JSON: {e}") - - save_merge_json(data_lis=conversion_lis, file_path='merge.json') + save_merge_json(data_lis=conversion_lis, file_path=merge_path) \ No newline at end of file From f246532984455b517d17e83abba0825775d04386 Mon Sep 17 00:00:00 2001 From: jupyter Date: Sat, 20 Jan 2024 23:46:46 +0800 Subject: [PATCH 3/3] ADD ft.config @aJupyter --- finetune/ft_config.py | 194 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 finetune/ft_config.py diff --git a/finetune/ft_config.py b/finetune/ft_config.py new file mode 100644 index 0000000..5b48fe2 --- /dev/null +++ b/finetune/ft_config.py @@ -0,0 +1,194 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = '/root/share/model_repos/internlm2-chat-7b' + + +# Data +data_path = 'merge.json' +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = AdamW +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。" +evaluation_inputs = [ + '我最近总是感到很焦虑,尤其是在学业上。我有个特别崇拜的同学,他好像在各方面都比我优秀,我总觉得自己怎么努力也追不上他,这让我压力特别大。', '我知道应该理性看待,但就是忍不住会去比较。我甚至晚上会因为这个睡不着觉,总想着怎样才能像他那样出色。' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, 
+ padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path='json', data_files=dict(train=data_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=None, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # 
noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + T_max=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed environment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False)