From c354ffd7e06ac3240ed5c083323d70a4899ee906 Mon Sep 17 00:00:00 2001
From: zealot52099
Date: Thu, 21 Mar 2024 07:58:13 +0800
Subject: [PATCH 1/7] [DOC] update datasets/README.md

---
 datasets/README.md | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/datasets/README.md b/datasets/README.md
index cf04b96..34ee167 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -44,17 +44,13 @@
 ## 数据集去重
 结合绝对匹配以及模糊匹配(Simhash)算法,对数据集进行去重以提升微调模型的效果。在确保数据集的高质量的同时,通过调整阈值减少因错误匹配而丢失重要数据的风险。
 
-Simhash算法
+**Simhash算法**
 Simhash(相似性哈希)是一种用于检测大量数据中相似或重复项的算法。它通过将文本转换为一组数值指纹来工作,这些指纹对相似的文本具有高度的相似性。Simhash算法对于处理文本数据特别有效,尤其是在处理大量数据时。
 
-实现步骤:
-文本预处理:将文本数据转换为适合Simhash处理的格式。这可能包括分词、去除停用词、词干提取等。
-
-生成Simhash指纹:对预处理后的文本应用Simhash算法,生成一组数值指纹。每个指纹代表文本内容的一个哈希值。
-
-比较指纹:通过比较哈希值的相似性来识别重复或相似的记录。Simhash的特点是即使在文本有少量差异时,生成的哈希值也具有较高的相似性。
-
-确定阈值:设置一个相似性阈值,只有当两个指纹的相似度超过这个阈值时,才认为它们代表相似或重复的记录。
-
-处理相似记录:对于被标记为相似的记录,可以进一步人工审查或自动合并,以消除重复。
+**Simhash实现步骤**
+- 文本预处理:将文本数据转换为适合Simhash处理的格式。这可能包括分词、去除停用词、词干提取等。
+- 生成Simhash指纹:对预处理后的文本应用Simhash算法,生成一组数值指纹。每个指纹代表文本内容的一个哈希值。
+- 比较指纹:通过比较哈希值的相似性来识别重复或相似的记录。Simhash的特点是即使在文本有少量差异时,生成的哈希值也具有较高的相似性。
+- 确定阈值:设置一个相似性阈值,只有当两个指纹的相似度超过这个阈值时,才认为它们代表相似或重复的记录。
+- 处理相似记录:对于被标记为相似的记录,可以进一步人工审查或自动合并,以消除重复。

From e2025cc8ea0f078470252c8e524c216745a7e415 Mon Sep 17 00:00:00 2001
From: zealot52099
Date: Thu, 21 Mar 2024 08:24:15 +0800
Subject: [PATCH 2/7] [DOC] update datasets/README.md

---
 datasets/README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/datasets/README.md b/datasets/README.md
index 34ee167..a2c2385 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -44,7 +44,7 @@
 ## 数据集去重
 结合绝对匹配以及模糊匹配(Simhash)算法,对数据集进行去重以提升微调模型的效果。在确保数据集的高质量的同时,通过调整阈值减少因错误匹配而丢失重要数据的风险。
 
-**Simhash算法**
+**Simhash算法介绍**
 Simhash(相似性哈希)是一种用于检测大量数据中相似或重复项的算法。它通过将文本转换为一组数值指纹来工作,这些指纹对相似的文本具有高度的相似性。Simhash算法对于处理文本数据特别有效,尤其是在处理大量数据时。
 
 **Simhash实现步骤**
 - 文本预处理:将文本数据转换为适合Simhash处理的格式。这可能包括分词、去除停用词、词干提取等。
 - 生成Simhash指纹:对预处理后的文本应用Simhash算法,生成一组数值指纹。每个指纹代表文本内容的一个哈希值。
 - 比较指纹:通过比较哈希值的相似性来识别重复或相似的记录。Simhash的特点是即使在文本有少量差异时,生成的哈希值也具有较高的相似性。
 - 确定阈值:设置一个相似性阈值,只有当两个指纹的相似度超过这个阈值时,才认为它们代表相似或重复的记录。
 - 处理相似记录:对于被标记为相似的记录,可以进一步人工审查或自动合并,以消除重复。
+## 用法
+### deduplicate.py
+`deduplicate.py` 用于对 datasets 下以模型命名的文件夹(例如:`datasets/qwen`)中的 .json 数据进行去重,并将去重后的数据输出到 `datasets/qwen/dedup` 文件夹下。
\ No newline at end of file

From d42f378eaa9e796faa97f19abd990ccc663c2c7d Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Thu, 21 Mar 2024 15:55:50 +0900
Subject: [PATCH 3/7] add internlm2_7b_base_qlora_e3.py and modify requirements.txt

---
 xtuner_config/internlm2_7b_base_qlora_e3.py | 203 ++++++++++++++++++++
 xtuner_config/requirements.txt              |  18 +-
 2 files changed, 218 insertions(+), 3 deletions(-)
 create mode 100644 xtuner_config/internlm2_7b_base_qlora_e3.py

diff --git a/xtuner_config/internlm2_7b_base_qlora_e3.py b/xtuner_config/internlm2_7b_base_qlora_e3.py
new file mode 100644
index 0000000..7e5957f
--- /dev/null
+++ b/xtuner_config/internlm2_7b_base_qlora_e3.py
@@ -0,0 +1,203 @@
+# Copyright (c) OpenMMLab. All rights reserved.
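+#
+# QLoRA fine-tuning config for internlm2-base-7b with xtuner: 4-bit NF4
+# quantization plus LoRA (r=64, alpha=16), trained for 3 epochs on the merged
+# EmoLLM dataset (combined_data.json). The internlm2_chat prompt template is
+# reused below because no internlm2_base template exists.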
+import torch +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +# pretrained_model_name_or_path = '/root/share/model_repos/internlm2-chat-7b' +pretrained_model_name_or_path = '/root/share/model_repos/internlm2-base-7b' + +# Data +# data_path = 'merge.json' +data_path ='/root/StableCascade/emollm2/EmoLLM/datasets/processed/combined_data.json' + +# https://github.com/InternLM/xtuner/blob/main/xtuner/utils/templates.py#L24C25-L24C25 +prompt_template = PROMPT_TEMPLATE.internlm2_chat # there is No internlm2_base + +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer + +# batch_size = 8 # per_device +# accumulative_counts = 2 +batch_size = 16 # per_device +accumulative_counts = 1 + +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = AdamW +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Evaluate the generation performance during the training +evaluation_freq = 500 +# SYSTEM = "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。" +SYSTEM = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" +evaluation_inputs = [ + '我最近总是感到很焦虑,尤其是在学业上。我有个特别崇拜的同学,他好像在各方面都比我优秀,我总觉得自己怎么努力也追不上他,这让我压力特别大。', '我知道应该理性看待,但就是忍不住会去比较。我甚至晚上会因为这个睡不着觉,总想着怎样才能像他那样出色。' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path='json', data_files=dict(train=data_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=None, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + 
remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + T_max=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner_config/requirements.txt b/xtuner_config/requirements.txt index d637742..a2c983a 100644 --- a/xtuner_config/requirements.txt +++ b/xtuner_config/requirements.txt @@ -1,11 +1,23 @@ datasets==2.16.1 deepspeed==0.13.1 einops==0.7.0 -flash_attn==2.5.0 -mmengine==0.10.2 openxlab==0.0.34 peft==0.7.1 sentencepiece==0.1.99 torch==2.1.2 transformers==4.36.2 -xtuner==0.1.11 + +# modified version +# xtuner==0.1.11 +# mmengine==0.10.2 +mmengine==0.10.3 +xtuner==0.1.15 + +# flash_attn==2.5.0 # build is very slow about 2 hours? 
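+# both methods below install a prebuilt wheel instead of compiling from source: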
+ +# method 1: https://github.com/Dao-AILab/flash-attention/releases +# flash_attn-2.5.0+cu122torch2.1cxx11abiTRUE-cp310-cp310-linux_x86_64.whl +# method 2: +# pip install /root/share/wheels/flash_attn-2.4.2+cu118torch2.0cxx11abiTRUE-cp310-cp310-linux_x86_64.whl + +# mpi4py==3.1.5 # conda install mpi4py From ce2cb5156c9ff7799ece80d151fb2729dfe71342 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Thu, 21 Mar 2024 15:56:54 +0900 Subject: [PATCH 4/7] update data.json (delete 4 empty data) 4 empty lines in data.json 425 483 742 1120 --- datasets/data.json | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/datasets/data.json b/datasets/data.json index 904b04c..d3f0fe4 100644 --- a/datasets/data.json +++ b/datasets/data.json @@ -7552,9 +7552,6 @@ } ] }, - { - "conversation": [] - }, { "conversation": [ { @@ -8540,9 +8537,6 @@ } ] }, - { - "conversation": [] - }, { "conversation": [ { @@ -13389,9 +13383,6 @@ } ] }, - { - "conversation": [] - }, { "conversation": [ { @@ -19973,9 +19964,6 @@ } ] }, - { - "conversation": [] - }, { "conversation": [ { From 085a01eafa794cd69a457d7002fb194276bc2118 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Thu, 21 Mar 2024 16:01:54 +0900 Subject: [PATCH 5/7] add dataset processing codes 1. update process.py for multi_turn_dataset(1 and 2) and data.json, data_pro.json 2. add datasets\processed\process_single_turn_conversation_construction.py for single-turn dataset (1 and 2) 3. add datasets\processed\process_merge.py for these 6 updated dataset in datasets\processed\ --- datasets/processed/process.py | 19 +++++++++-- datasets/processed/process_merge.py | 34 +++++++++++++++++++ ...s_single_turn_conversation_construction.py | 31 +++++++++++++++++ 3 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 datasets/processed/process_merge.py create mode 100644 datasets/processed/process_single_turn_conversation_construction.py diff --git a/datasets/processed/process.py b/datasets/processed/process.py index 46fda23..3d34cb3 100644 --- a/datasets/processed/process.py +++ b/datasets/processed/process.py @@ -1,12 +1,25 @@ import json # 打开JSON文件并读取其内容 -with open('/root/Emollm/datasets/multi_turn_dataset_2.json', 'rt', encoding='utf-8') as file: + +# file_name = 'multi_turn_dataset_1.json' +# file_name = 'multi_turn_dataset_2.json' +# file_name = 'data_pro.json' +file_name = 'data.json' + +with open(f'/root/StableCascade/emollm2/EmoLLM/datasets/{file_name}', 'rt', encoding='utf-8') as file: data = json.load(file) n = 0 for i in data: - i['conversation'][0]['system'] = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" + + try: + i['conversation'][0]['system'] = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" + except: + print(n,i) # 4 empty lines in data.json 425 483 742 1120 + n+=1 -with open('output2.json', 'wt', encoding='utf-8') as file: +with open(f'processed_{file_name}', 'wt', encoding='utf-8') as file: json.dump(data, file, ensure_ascii=False, indent=4) + +print(data[0]) \ No newline at end of file diff --git a/datasets/processed/process_merge.py b/datasets/processed/process_merge.py new file mode 100644 index 0000000..6db4f92 --- /dev/null +++ b/datasets/processed/process_merge.py @@ -0,0 +1,34 @@ +import os +import json + +# 设置目录路径,这里假设你的JSON文件都在当前目录下的json_files文件夹中 +directory_path = './' + +# 初始化一个空列表,用于存储所有JSON文件的数据 +combined_list = [] + +# 遍历指定目录下的所有文件 +for filename in os.listdir(directory_path): + # 检查文件扩展名是否为.json + if filename.endswith('.json'): + # 构建文件的完整路径 + 
file_path = os.path.join(directory_path, filename) + + # 打开并读取JSON文件 + with open(file_path, 'r', encoding='utf-8') as json_file: + # 加载JSON文件的内容 + data = json.load(json_file) + + # 将读取到的数据添加到combined_list中 + # 假设每个JSON文件包含的是一个列表,如果不是,可以根据实际情况调整 + if isinstance(data, list): + combined_list.extend(data) + else: + combined_list.append(data) + +# 打印合并后的列表 +# print(combined_list) + +# 如果需要,可以将合并后的列表保存到一个新的JSON文件中 +with open('combined_data.json', 'w', encoding='utf-8') as combined_json_file: + json.dump(combined_list, combined_json_file, ensure_ascii=False, indent=4) \ No newline at end of file diff --git a/datasets/processed/process_single_turn_conversation_construction.py b/datasets/processed/process_single_turn_conversation_construction.py new file mode 100644 index 0000000..5c419ab --- /dev/null +++ b/datasets/processed/process_single_turn_conversation_construction.py @@ -0,0 +1,31 @@ +import json + +# 打开JSON文件并读取其内容 +# file_name = 'single_turn_dataset_1.json' +file_name = 'single_turn_dataset_2.json' +with open(f'/root/StableCascade/emollm2/EmoLLM/datasets/{file_name}', 'rt', encoding='utf-8') as file: + format1_data = json.load(file) + +# n = 0 +# for i in data: +# i['conversation'][0]['system'] = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" + +system = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" + +# 转换为格式2的数据 +format2_data = [] +for item in format1_data: + conversation = { + "system": system, + "input": item["prompt"], + "output": item["completion"] + } + format2_data.append({"conversation": [conversation]}) + +# 将转换后的数据转换为JSON格式 + + +with open(f'./processed_{file_name}', 'wt', encoding='utf-8') as file: + json.dump(format2_data, file, ensure_ascii=False, indent=4) + +print(format2_data[0]) \ No newline at end of file From d25a304c4d3990bc6d53c95a8f2dfef233321d99 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Thu, 21 Mar 2024 16:06:41 +0900 Subject: [PATCH 6/7] Update process_single_turn_conversation_construction.py --- .../process_single_turn_conversation_construction.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/datasets/processed/process_single_turn_conversation_construction.py b/datasets/processed/process_single_turn_conversation_construction.py index 5c419ab..5dd5a72 100644 --- a/datasets/processed/process_single_turn_conversation_construction.py +++ b/datasets/processed/process_single_turn_conversation_construction.py @@ -6,10 +6,6 @@ file_name = 'single_turn_dataset_2.json' with open(f'/root/StableCascade/emollm2/EmoLLM/datasets/{file_name}', 'rt', encoding='utf-8') as file: format1_data = json.load(file) -# n = 0 -# for i in data: -# i['conversation'][0]['system'] = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" - system = "你是心理健康助手EmoLLM,由EmoLLM团队打造。你旨在通过专业心理咨询,协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术,一步步帮助来访者解决心理问题。" # 转换为格式2的数据 @@ -28,4 +24,4 @@ for item in format1_data: with open(f'./processed_{file_name}', 'wt', encoding='utf-8') as file: json.dump(format2_data, file, ensure_ascii=False, indent=4) -print(format2_data[0]) \ No newline at end of file +print(format2_data[0]) From 4ff7910368b6a8642aa6c00a63e8aa8779f4264a Mon Sep 17 00:00:00 2001 From: HongCheng Date: Thu, 21 Mar 2024 16:07:18 +0900 Subject: [PATCH 7/7] Update process_merge.py --- datasets/processed/process_merge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datasets/processed/process_merge.py b/datasets/processed/process_merge.py index 
6db4f92..9cb4265 100644 --- a/datasets/processed/process_merge.py +++ b/datasets/processed/process_merge.py @@ -1,7 +1,7 @@ import os import json -# 设置目录路径,这里假设你的JSON文件都在当前目录下的json_files文件夹中 +# 设置目录路径,这里假设你的JSON文件都在当前目录下的directory_path文件夹中 directory_path = './' # 初始化一个空列表,用于存储所有JSON文件的数据 @@ -26,9 +26,9 @@ for filename in os.listdir(directory_path): else: combined_list.append(data) -# 打印合并后的列表 +# 打印合并后的列表 very large and slow # print(combined_list) # 如果需要,可以将合并后的列表保存到一个新的JSON文件中 with open('combined_data.json', 'w', encoding='utf-8') as combined_json_file: - json.dump(combined_list, combined_json_file, ensure_ascii=False, indent=4) \ No newline at end of file + json.dump(combined_list, combined_json_file, ensure_ascii=False, indent=4)
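
The Simhash-based deduplication described in the datasets/README.md changes of PATCH 1/7 and 2/7 is performed by `deduplicate.py`, which is not included in this patch series. The sketch below illustrates the fingerprint-and-threshold idea on the `conversation`/`input`/`output` JSON format used by the processing scripts above; the character 3-gram features, 64-bit fingerprint, Hamming-distance threshold of 3, and file names are illustrative assumptions, not the repository's actual implementation.

```python
import hashlib
import json


def simhash(text, nbits=64):
    # character 3-gram features; a real pipeline may tokenize and drop stopwords first
    feats = [text[i:i + 3] for i in range(max(len(text) - 2, 1))]
    votes = [0] * nbits
    for feat in feats:
        h = int(hashlib.md5(feat.encode('utf-8')).hexdigest(), 16)
        for b in range(nbits):
            votes[b] += 1 if (h >> b) & 1 else -1
    # bit b of the fingerprint is 1 if the feature hashes "vote" positively for it
    return sum(1 << b for b in range(nbits) if votes[b] > 0)


def hamming(a, b):
    return bin(a ^ b).count('1')


def dedup(items, threshold=3):
    # keep an item only if its fingerprint differs from every kept one by more than `threshold` bits
    kept, fingerprints = [], []
    for item in items:
        text = ''.join(turn.get('input', '') + turn.get('output', '')
                       for turn in item['conversation'])
        fp = simhash(text)
        if all(hamming(fp, seen) > threshold for seen in fingerprints):
            kept.append(item)
            fingerprints.append(fp)
    return kept


if __name__ == '__main__':
    # input/output paths are illustrative, not the repository's layout
    with open('combined_data.json', 'rt', encoding='utf-8') as f:
        data = json.load(f)
    with open('combined_data_dedup.json', 'wt', encoding='utf-8') as f:
        json.dump(dedup(data), f, ensure_ascii=False, indent=4)
```

Raising the threshold removes more near-duplicates but, as the README notes, increases the risk of discarding records that are merely similar rather than redundant. The pairwise comparison here is O(n²); a real pipeline over large datasets would bucket fingerprints instead of comparing each new item against everything already kept.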