diff --git a/.ipynb_checkpoints/finetune-checkpoint.py b/.ipynb_checkpoints/finetune-checkpoint.py new file mode 100644 index 0000000..0a41e55 --- /dev/null +++ b/.ipynb_checkpoints/finetune-checkpoint.py @@ -0,0 +1,273 @@ +import os +import sys +from typing import List + +import time +import fire +import torch +import transformers +from datasets import load_dataset + +""" +Unused imports: +import torch.nn as nn +import bitsandbytes as bnb +""" + +from peft import ( + LoraConfig, + get_peft_model, + get_peft_model_state_dict, + prepare_model_for_int8_training, + set_peft_model_state_dict, +) +from transformers import LlamaForCausalLM, LlamaTokenizer + +from utils.prompter import Prompter + + +def train( + # model/data params + base_model: str = "", # the only required argument + data_path: str = "YOUR LLM PATH", + output_dir: str = "./lora-alpaca", + # training hyperparams + batch_size: int = 16, + micro_batch_size: int = 16, + num_epochs: int = 2, + learning_rate: float = 3e-4, + cutoff_len: int = 512, + val_set_size: int = 0, + # lora hyperparams + lora_r: int = 16, + lora_alpha: int = 16, + lora_dropout: float = 0.05, + lora_target_modules: List[str] = [ + "q_proj", + "v_proj", + ], + # llm hyperparams + train_on_inputs: bool = True, # if False, masks out inputs in loss + add_eos_token: bool = False, + group_by_length: bool = False, # faster, but produces an odd training loss curve + # wandb params + wandb_project: str = "", + wandb_run_name: str = "", + wandb_watch: str = "", # options: false | gradients | all + wandb_log_model: str = "", # options: false | true + resume_from_checkpoint: str = None, # either training checkpoint or final adapter + prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca. +): + if int(os.environ.get("LOCAL_RANK", 0)) == 0: + print( + f"Training Alpaca-LoRA model with params:\n" + f"base_model: {base_model}\n" + f"data_path: {data_path}\n" + f"output_dir: {output_dir}\n" + f"batch_size: {batch_size}\n" + f"micro_batch_size: {micro_batch_size}\n" + f"num_epochs: {num_epochs}\n" + f"learning_rate: {learning_rate}\n" + f"cutoff_len: {cutoff_len}\n" + f"val_set_size: {val_set_size}\n" + f"lora_r: {lora_r}\n" + f"lora_alpha: {lora_alpha}\n" + f"lora_dropout: {lora_dropout}\n" + f"lora_target_modules: {lora_target_modules}\n" + f"train_on_inputs: {train_on_inputs}\n" + f"add_eos_token: {add_eos_token}\n" + f"group_by_length: {group_by_length}\n" + f"wandb_project: {wandb_project}\n" + f"wandb_run_name: {wandb_run_name}\n" + f"wandb_watch: {wandb_watch}\n" + f"wandb_log_model: {wandb_log_model}\n" + f"resume_from_checkpoint: {resume_from_checkpoint or False}\n" + f"prompt template: {prompt_template_name}\n" + ) + assert ( + base_model + ), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'" + gradient_accumulation_steps = batch_size // micro_batch_size + + prompter = Prompter(prompt_template_name) + + device_map = "auto" + world_size = int(os.environ.get("WORLD_SIZE", 1)) + ddp = world_size != 1 + if ddp: + device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} + gradient_accumulation_steps = gradient_accumulation_steps // world_size + + + model = LlamaForCausalLM.from_pretrained( + base_model, + # load_in_8bit=True, + torch_dtype=torch.float16, + device_map=device_map, + ) + + tokenizer = LlamaTokenizer.from_pretrained(base_model) + + tokenizer.pad_token_id = ( + 0 # unk. 
we want this to be different from the eos token + ) + tokenizer.padding_side = "left" # Allow batched inference + + def tokenize(prompt, add_eos_token=True): + # there's probably a way to do this with the tokenizer settings + # but again, gotta move fast + result = tokenizer( + prompt, + truncation=True, + max_length=cutoff_len, + padding=False, + return_tensors=None, + ) + if ( + result["input_ids"][-1] != tokenizer.eos_token_id + and len(result["input_ids"]) < cutoff_len + and add_eos_token + ): + result["input_ids"].append(tokenizer.eos_token_id) + result["attention_mask"].append(1) + + result["labels"] = result["input_ids"].copy() + + return result + + def generate_and_tokenize_prompt(data_point): + full_prompt = prompter.generate_prompt( + data_point["instruction"], + data_point["input"], + data_point["output"], + ) + tokenized_full_prompt = tokenize(full_prompt) + if not train_on_inputs: + user_prompt = prompter.generate_prompt( + data_point["instruction"], data_point["input"] + ) + tokenized_user_prompt = tokenize( + user_prompt, add_eos_token=add_eos_token + ) + user_prompt_len = len(tokenized_user_prompt["input_ids"]) + + if add_eos_token: + user_prompt_len -= 1 + + tokenized_full_prompt["labels"] = [ + -100 + ] * user_prompt_len + tokenized_full_prompt["labels"][ + user_prompt_len: + ] # could be sped up, probably + return tokenized_full_prompt + + model = prepare_model_for_int8_training(model) + + config = LoraConfig( + r=lora_r, + lora_alpha=lora_alpha, + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, config) + + if data_path.endswith(".json") or data_path.endswith(".jsonl"): + data = load_dataset("json", data_files=data_path) + else: + data = load_dataset(data_path) + + if resume_from_checkpoint: + # Check the available weights and load them + checkpoint_name = os.path.join( + resume_from_checkpoint, "pytorch_model.bin" + ) # Full checkpoint + if not os.path.exists(checkpoint_name): + checkpoint_name = os.path.join( + resume_from_checkpoint, "adapter_model.bin" + ) # only LoRA model - LoRA config above has to fit + resume_from_checkpoint = ( + False # So the trainer won't try loading its state + ) + # The two files above have a different name depending on how they were saved, but are actually the same. + if os.path.exists(checkpoint_name): + print(f"Restarting from {checkpoint_name}") + adapters_weights = torch.load(checkpoint_name) + set_peft_model_state_dict(model, adapters_weights) + else: + print(f"Checkpoint {checkpoint_name} not found") + + model.print_trainable_parameters() # Be more transparent about the % of trainable params. 
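# Illustrative worked example (annotation, not part of the patch; token ids are
# made up): with train_on_inputs=False, generate_and_tokenize_prompt above masks
# the prompt portion of the labels so that loss is computed only on the response.
#
#   full prompt ids : [ 101, 102, 103, 104, 105, 201, 202, 2 ]
#   user_prompt_len : 5
#   labels          : [-100,-100,-100,-100,-100, 201, 202, 2 ]
#
# Positions labelled -100 are ignored by the cross-entropy loss in transformers
# (ignore_index=-100 is the default), so only the answer and the trailing EOS
# token are supervised.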
+ + if val_set_size > 0: + train_val = data["train"].train_test_split( + test_size=val_set_size, shuffle=True, seed=42 + ) + train_data = ( + train_val["train"].shuffle().map(generate_and_tokenize_prompt) + ) + val_data = ( + train_val["test"].shuffle().map(generate_and_tokenize_prompt) + ) + else: + train_data = data["train"].shuffle().map(generate_and_tokenize_prompt) + val_data = None + + if not ddp and torch.cuda.device_count() > 1: + # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available + model.is_parallelizable = True + model.model_parallel = True + + trainer = transformers.Trainer( + model=model, + train_dataset=train_data, + eval_dataset=val_data, + args=transformers.TrainingArguments( + per_device_train_batch_size=micro_batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + warmup_steps=100, + num_train_epochs=num_epochs, + learning_rate=learning_rate, + fp16=True, + logging_steps=10, + optim="adamw_torch", + evaluation_strategy="steps" if val_set_size > 0 else "no", + save_strategy="steps", + eval_steps=None, + save_steps=8000, + output_dir=output_dir, + save_total_limit=2, + load_best_model_at_end=True if val_set_size > 0 else False, + ddp_find_unused_parameters=False if ddp else None, + group_by_length=group_by_length, + report_to=None, + run_name=None, + ), + data_collator=transformers.DataCollatorForSeq2Seq( + tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True + ), + ) + model.config.use_cache = False + + old_state_dict = model.state_dict + model.state_dict = ( + lambda self, *_, **__: get_peft_model_state_dict( + self, old_state_dict() + ) + ).__get__(model, type(model)) + + if torch.__version__ >= "2" and sys.platform != "win32": + model = torch.compile(model) + + trainer.train(resume_from_checkpoint=resume_from_checkpoint) + + model.save_pretrained(output_dir) + + print( + "\n If there's a warning about missing keys above, please disregard :)" + ) + + +if __name__ == "__main__": + fire.Fire(train) diff --git a/.ipynb_checkpoints/finetune_kopa-checkpoint.py b/.ipynb_checkpoints/finetune_kopa-checkpoint.py new file mode 100644 index 0000000..7066f3f --- /dev/null +++ b/.ipynb_checkpoints/finetune_kopa-checkpoint.py @@ -0,0 +1,476 @@ +import os +import sys +from typing import List + +import fire +import torch +import transformers +from datasets import load_dataset + +from kopa import KoPAWithAdapter + +""" +Unused imports: +import torch.nn as nn +import bitsandbytes as bnb +""" + +from peft import PrefixTuningConfig, get_peft_model +from transformers import AutoModelForCausalLM, AutoTokenizer + +from utils.prompter import Prompter + + +def custom_collate_fn(batch): + input_ids_list = [] + attention_mask_list = [] + static_prefix_list = [] + sensor_data_list = [] + # qwen_dict= {'llama_eos_tid':, 'qwen_eos_tid':} + + for b in batch: + # 确保输入是张量 + if isinstance(b["input_ids"], list): + input_ids = torch.tensor(b["input_ids"], dtype=torch.long) + else: + input_ids = b["input_ids"] + input_ids_list.append(input_ids) + + if isinstance(b["attention_mask"], list): + attention_mask = torch.tensor(b["attention_mask"], dtype=torch.long) + else: + attention_mask = b["attention_mask"] + attention_mask_list.append(attention_mask) + + if "static_prefix" in b: + if isinstance(b["static_prefix"], list): + static_prefix = torch.tensor(b["static_prefix"], dtype=torch.long) + else: + static_prefix = b["static_prefix"] + static_prefix_list.append(static_prefix) + + if "sensor_data" in b: + if isinstance(b["sensor_data"], list): + 
sensor_data = torch.tensor(b["sensor_data"], dtype=torch.float) + else: + sensor_data = b["sensor_data"] + sensor_data_list.append(sensor_data) + max_length=0 + for one_inputs in input_ids_list: + max_length = one_inputs.size(0) if max_length < one_inputs.size(0) else max_length + input_ids_list_=list() + for one_inputs in input_ids_list: + input_ids_list_.append(torch.cat((one_inputs, torch.full((max_length-one_inputs.size(0),), 0, dtype=torch.int)), dim=-1)) + + + attention_mask_list_=list() + for mask in attention_mask_list: + attention_mask_list_.append(torch.cat((mask, torch.full((max_length-mask.size(0),), 0, dtype=torch.int)), dim=-1)) + + # print("=====",input_ids_list) + # exit(0) + + # 堆叠数据 + result = { + "input_ids": torch.stack(input_ids_list_), + "attention_mask": torch.stack(attention_mask_list_), + } + + if static_prefix_list: + result["static_prefix"] = torch.stack(static_prefix_list) + + if sensor_data_list: + result["sensor_data"] = torch.stack(sensor_data_list) + + if "labels" in batch[0]: + labels_list = [] + for b in batch: + if isinstance(b["labels"], list): + labels = torch.tensor(b["labels"], dtype=torch.long) + else: + labels = b["labels"] + labels_list.append(labels) + labels_list_=list() + for label in labels_list: + labels_list_.append(torch.cat((label, torch.full((max_length-label.size(0),), 0, dtype=torch.int)), dim=-1)) + + + result["labels"] = torch.stack(labels_list_) + + return result + + +def train( + # model/data params + base_model="/root/shared-nvme/models/Qwen2.5-7B-Instruct", + data_path: str = "/root/shared-nvme/dataset/olive_dataset.json", + output_dir: str = "output", + # training hyperparams + batch_size: int = 16, + micro_batch_size: int = 16, + num_epochs: int = 2, + learning_rate: float = 1e-4, + cutoff_len: int = 512, + val_set_size: int = 0, + num_prefix: int = 1, + # llm hyperparams + train_on_inputs: bool = True, # if False, masks out inputs in loss + add_eos_token: bool = False, + group_by_length: bool = False, # faster, but produces an odd training loss curve + # wandb params + wandb_project: str = "", + wandb_run_name: str = "", + wandb_watch: str = "", # options: false | gradients | all + wandb_log_model: str = "", # options: false | true + resume_from_checkpoint: str = None, # either training checkpoint or final adapter + prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca. +): + if int(os.environ.get("LOCAL_RANK", 0)) == 0: + print( + f"Training Alpaca model with params:\n" + f"base_model: {base_model}\n" + f"data_path: {data_path}\n" + f"output_dir: {output_dir}\n" + f"batch_size: {batch_size}\n" + f"micro_batch_size: {micro_batch_size}\n" + f"num_epochs: {num_epochs}\n" + f"learning_rate: {learning_rate}\n" + f"cutoff_len: {cutoff_len}\n" + f"val_set_size: {val_set_size}\n" + f"train_on_inputs: {train_on_inputs}\n" + f"add_eos_token: {add_eos_token}\n" + f"group_by_length: {group_by_length}\n" + f"wandb_project: {wandb_project}\n" + f"wandb_run_name: {wandb_run_name}\n" + f"wandb_watch: {wandb_watch}\n" + f"wandb_log_model: {wandb_log_model}\n" + f"resume_from_checkpoint: {resume_from_checkpoint or False}\n" + f"prompt template: {prompt_template_name}\n" + ) + assert ( + base_model + ), "Please specify a --base_model, e.g. 
--base_model='huggyllama/llama-7b'" + gradient_accumulation_steps = batch_size // micro_batch_size + + prompter = Prompter(prompt_template_name) + + device_map = "auto" + world_size = int(os.environ.get("WORLD_SIZE", 1)) + ddp = world_size != 1 + if ddp: + device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} + gradient_accumulation_steps = gradient_accumulation_steps // world_size + + model = AutoModelForCausalLM.from_pretrained( + base_model, + load_in_8bit=True, + # 使用Auto类自动选择正确的模型类型 + torch_dtype=torch.float16, + device_map=device_map, + trust_remote_code=True, # Qwen模型需要此参数 + ) + + tokenizer = AutoTokenizer.from_pretrained( + base_model, + trust_remote_code=True, # 添加此参数 + padding_side="left", # Qwen也推荐左侧填充 + ) + tokenizer.pad_token = tokenizer.eos_token + + + + # tokenizer.pad_token_id = ( + # 0 # unk. we want this to be different from the eos token + # ) + # tokenizer.padding_side = "left" # Allow batched inference + # model.gradient_checkpointing_enable() + # tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + model.generation_config.pad_token_id = model.generation_config.eos_token_id + + def ensure_consistent_keys(dataset): + all_keys = set() + for example in dataset: + all_keys.update(example.keys()) + + for example in dataset: + for key in all_keys: + if key not in example: + if key == "static_prefix": + example[key] = "" + elif key == "sensor_data": + example[key] = [0, 0, 0] + + return dataset + + # def tokenize(prompt, add_eos_token=True): + # # there's probably a way to do this with the tokenizer settings + # # but again, gotta move fast + # result = tokenizer( + # prompt, + # truncation=True, + # max_length=cutoff_len, + # padding=False, + # return_tensors=None, + # ) + # if ( + # result["input_ids"][-1] != tokenizer.eos_token_id + # and len(result["input_ids"]) < cutoff_len + # and add_eos_token + # ): + # result["input_ids"].append(tokenizer.eos_token_id) + # result["attention_mask"].append(1) + # + # result["labels"] = result["input_ids"].copy() + # + # return result + + def generate_and_tokenize_prompt(data_point): + full_prompt = prompter.generate_prompt( + data_point["instruction"], + data_point["input"], + data_point["output"], + ) + + # Tokenizer 处理文本 + tokenized_full_prompt = tokenizer( + full_prompt, + truncation=True, + max_length=cutoff_len, + padding=True, + return_tensors='pt', + ) + # for k,v in tokenized_full_prompt.items(): print("======k,v",k,v,type(k),type(v)) + + # exit(0) + + + tokenized_full_prompt = {k: v.squeeze(0) for k, v in tokenized_full_prompt.items()} + + # 处理静态前缀 + static_prefix = tokenizer( + data_point["instruction"], + truncation=True, + max_length=10, + padding="max_length", + return_tensors="pt" + )["input_ids"].squeeze(0) + + # 限制索引范围,确保 `static_prefix` 不会超出 `vocab_size` + static_prefix = torch.clamp(static_prefix, min=0, max=tokenizer.vocab_size - 1) + + tokenized_full_prompt["static_prefix"] = static_prefix + # print(f"[DEBUG] static_prefix (after clamp): {static_prefix}") + # print(f"[DEBUG] tokenizer vocab_size: {tokenizer.vocab_size}") + + # **处理动态数据** + sensor_values = torch.zeros(3, dtype=torch.float) # **默认值为 Tensor,而不是 list** + + if data_point["type"] == "dynamic" and "sensor_data" in data_point: + raw_sensor_values = data_point["sensor_data"] + + try: + sensor_values = torch.tensor([ + float(raw_sensor_values.get("temperature", 0.0)), + float(raw_sensor_values.get("humidity", 0.0)), + float(raw_sensor_values.get("conductivity", 0.0)) + ], dtype=torch.float) + except Exception 
as e: + # print(f"[ERROR] sensor_data 解析错误: {raw_sensor_values}, {e}") + if torch.isnan(sensor_values).any() or torch.isinf(sensor_values).any(): + # print(f"[ERROR] NaN/Inf detected in sensor_values: {sensor_values}") + sensor_values = torch.zeros(3, dtype=torch.float) + + # ✅ 确保 sensor_values 是 `Tensor` + if torch.isnan(sensor_values).any() or torch.isinf(sensor_values).any(): + print(f"[ERROR] NaN/Inf detected in sensor_values") + if torch.isnan(sensor_values).any() or torch.isinf(sensor_values).any(): + print(f"[ERROR] NaN/Inf detected in sensor_values") + sensor_values = torch.zeros(3, dtype=torch.float) + + # 限制范围,防止异常值 + sensor_values = torch.clamp(sensor_values, min=-100, max=100) + + # print(f"[DEBUG] sensor_values (AFTER FIX): {sensor_values}") # 🔥 打印调试信息 + if not isinstance(sensor_values, torch.Tensor): + sensor_values = torch.tensor(sensor_values, dtype=torch.float) + + tokenized_full_prompt["sensor_data"] = sensor_values # **确保始终是 Tensor** + + # 最后增加类型检查和转换 + for key in tokenized_full_prompt: + if isinstance(tokenized_full_prompt[key], list): + # Convert lists to tensors + tokenized_full_prompt[key] = torch.tensor(tokenized_full_prompt[key]) + elif isinstance(tokenized_full_prompt[key], torch.Tensor) and tokenized_full_prompt[key].dim() > 1: + # Squeeze extra dimensions if needed + tokenized_full_prompt[key] = tokenized_full_prompt[key].squeeze(0) + + if key in ["input_ids", "attention_mask"] and isinstance(tokenized_full_prompt[key], list): + tokenized_full_prompt[key] = torch.tensor(tokenized_full_prompt[key], dtype=torch.long) + + if isinstance(tokenized_full_prompt["static_prefix"], list): + tokenized_full_prompt["static_prefix"] = torch.tensor(tokenized_full_prompt["static_prefix"], + dtype=torch.long) + + # 确保sensor_data是tensor + if not isinstance(tokenized_full_prompt["sensor_data"], torch.Tensor): + tokenized_full_prompt["sensor_data"] = torch.tensor(tokenized_full_prompt["sensor_data"], dtype=torch.float) + + tokenized_full_prompt["labels"] = tokenized_full_prompt["input_ids"].clone() + + # 如果不想对输入部分计算损失,可以将输入部分的标签设为-100 + if not train_on_inputs: + # 找到用户输入和助手输出的分界点 + sep = tokenizer.encode(prompter.separator) + instruction_tokens = tokenizer.encode(data_point["instruction"]) + + # 将用户输入部分的标签设为-100 + sep_pos = tokenized_full_prompt["input_ids"].tolist().index(sep[0]) + tokenized_full_prompt["labels"][:sep_pos] = -100 + + return tokenized_full_prompt + + + # 创建PrefixTuning配置 + + prefix_config = PrefixTuningConfig( + num_virtual_tokens=num_prefix, + task_type="CAUSAL_LM" + ) + + # 创建PEFT模型 + peft_model = get_peft_model(model, prefix_config) + + + # 创建最终的KoPAWithAdapter模型 + final_model = KoPAWithAdapter(peft_model, num_prefix, tokenizer) + device = next(model.parameters()).device + print(f"[INFO] 使用设备: {device}") + + # 确保final_model及其组件都在相同设备上 + final_model = final_model.to(device) + + + if data_path.endswith(".json") or data_path.endswith(".jsonl"): + data = load_dataset("json", data_files=data_path) + else: + data = load_dataset(data_path) + + if resume_from_checkpoint: + # Check the available weights and load them + checkpoint_name = os.path.join( + resume_from_checkpoint, "pytorch_model.bin" + ) # Full checkpoint + if not os.path.exists(checkpoint_name): + checkpoint_name = os.path.join( + resume_from_checkpoint, "adapter_model.bin" + ) # only LoRA model - LoRA config above has to fit + resume_from_checkpoint = ( + False # So the trainer won't try loading its state + ) + # The two files above have a different name depending on how they were saved, but are actually 
the same. + if os.path.exists(checkpoint_name): + print(f"Restarting from {checkpoint_name}") + adapters_weights = torch.load(checkpoint_name) + else: + print(f"Checkpoint {checkpoint_name} not found") + + # model.print_trainable_parameters() # Be more transparent about the % of trainable params. + + if val_set_size > 0: + train_val = data["train"].train_test_split( + test_size=val_set_size, shuffle=True, seed=42 + ) + train_data = ( + train_val["train"].shuffle().map(generate_and_tokenize_prompt) + + ) + train_data = ensure_consistent_keys(train_data) + val_data = ( + train_val["test"].shuffle().map(generate_and_tokenize_prompt) + ) + else: + train_data = data["train"].shuffle().map(generate_and_tokenize_prompt) + train_data = ensure_consistent_keys(train_data) + val_data = None + + if not ddp and torch.cuda.device_count() > 1: + # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available + model.is_parallelizable = True + model.model_parallel = True + + trainer = transformers.Trainer( + model=final_model, + data_collator=custom_collate_fn, + train_dataset=train_data, + eval_dataset=val_data, + args=transformers.TrainingArguments( + per_device_train_batch_size=micro_batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + warmup_steps=100, + num_train_epochs=num_epochs, + learning_rate=learning_rate, + fp16=True, + logging_steps=10, + optim="adamw_hf", + evaluation_strategy="steps" if val_set_size > 0 else "no", + save_strategy="steps", + eval_steps=None, + save_steps=5000, + output_dir=output_dir, + save_total_limit=2, + load_best_model_at_end=True if val_set_size > 0 else False, + ddp_find_unused_parameters=False if ddp else None, + group_by_length=group_by_length, + report_to=None, + run_name=None, + ), + ) + # final_model.config.use_cache = False + + if torch.__version__ >= "2" and sys.platform != "win32": + final_model = torch.compile(model) + + trainer.train(resume_from_checkpoint=resume_from_checkpoint) + + final_model.save_pretrained(output_dir) + + # ⭐ 确保embeddings存在再保存 + if hasattr(final_model, "embeddings"): + torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth")) + else: + print("[WARNING] final_model没有embeddings属性,跳过保存。") + + try: + final_model.model.save_pretrained(os.path.join(output_dir, "peft_model")) + print(f"[INFO] PEFT模型保存到 {os.path.join(output_dir, 'peft_model')}") + except Exception as e: + print(f"[WARNING] 保存PEFT模型时出错: {e}") + +def inspect_model_structure(model): + """检查模型结构并打印关键层信息""" + print(f"Model type: {type(model).__name__}") + print(f"Model config: {model.config.__class__.__name__}") + + # 检查嵌入层 + embedding_layers = [] + for name, module in model.named_modules(): + if any(key in name for key in ['embed', 'wte', 'word_embeddings']): + embedding_layers.append((name, type(module).__name__)) + if hasattr(module, 'weight'): + print(f"Layer {name}: shape {module.weight.shape}") + + print(f"Found {len(embedding_layers)} potential embedding layers:") + for name, type_name in embedding_layers: + print(f" - {name}: {type_name}") + + # 检查注意力层 + print("\nAttention structure:") + for name, module in model.named_modules(): + if 'attention' in name.lower(): + print(f" - {name}: {type(module).__name__}") + + +if __name__ == "__main__": + fire.Fire(train) diff --git a/.ipynb_checkpoints/kopa-checkpoint.py b/.ipynb_checkpoints/kopa-checkpoint.py new file mode 100644 index 0000000..ea362ed --- /dev/null +++ b/.ipynb_checkpoints/kopa-checkpoint.py @@ -0,0 +1,297 @@ +import torch +import torch.nn as nn +from 
typing import Optional, List, Union, Tuple + +from transformers import LlamaForCausalLM + + +class KoPA(nn.Module): + def __init__( + self, + model + ) -> None: + super(KoPA, self).__init__() + self.llama_model = model + for param in self.model.parameters(): + param.requires_grad = False + + # Only keep gradients for the adapter parts + self.num_prefix = num_prefix + hidden_size = model.config.hidden_size + self.embeddings = nn.Embedding(100, 4096) + for param in model.parameters(): + param.requires_grad = False + + # Only enable gradients for adapter components + self.static_prefix_embedding.requires_grad_(True) + self.sensor_mlp.requires_grad_(True) + self.norm.requires_grad_(True) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + embedding_ids: torch.LongTensor = None + ): + if embedding_ids.max() >= self.embeddings.num_embeddings or embedding_ids.min() < 0: + print(f"[ERROR] embedding_ids 超出范围!最大值: {embedding_ids.max()}, 最小值: {embedding_ids.min()}") + embedding_ids = torch.clamp(embedding_ids, min=0, max=self.embeddings.num_embeddings - 1) + kg_embeds = self.embeddings(embedding_ids) + batch_size, seq_len, _ = kg_embeds.shape + if hasattr(self.llama_model, 'transformer'): + # Qwen模型 + token_embeds = self.llama_model.transformer.wte(input_ids) + elif hasattr(self.llama_model, 'model') and hasattr(self.llama_model.model, 'embed_tokens'): + # 原始路径 + token_embeds = self.llama_model.model.model.embed_tokens(input_ids) + else: + # 添加调试代码 + print("无法找到模型嵌入层,尝试检测模型结构...") + raise ValueError("模型结构不兼容") + input_embeds = torch.cat((kg_embeds, token_embeds), dim=1) + prefix_mask = torch.ones((batch_size, seq_len)) + prefix_labels = torch.full((batch_size, seq_len), fill_value=-100, dtype=torch.long) + new_attention_mask = torch.cat((prefix_mask.cuda(), attention_mask), dim=-1) + new_labels = torch.cat((prefix_labels.cuda(), labels), dim=-1) + if embedding_ids.max() >= self.embeddings.num_embeddings or embedding_ids.min() < 0: + print(f"[ERROR] embedding_ids 超出范围!最大值: {embedding_ids.max()}, 最小值: {embedding_ids.min()}") + embedding_ids = torch.clamp(embedding_ids, min=0, max=self.embeddings.num_embeddings - 1) + + return self.llama_model( + input_ids=None, + attention_mask=new_attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=input_embeds, + labels=new_labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class KoPAWithAdapter(nn.Module): + def __init__(self, model, num_prefix, tokenizer=None): + super().__init__() + self.model = model + self.num_prefix = num_prefix + hidden_size = model.config.hidden_size + + # 打印模型信息以便调试 + print(f"[INFO] 初始化KoPAWithAdapter,模型类型: {type(model).__name__}") + + # 使用tokenizer获取vocab_size + vocab_size = tokenizer.vocab_size if tokenizer else 151936 # Qwen2.5的默认词表大小 + print(f"[INFO] 使用词表大小: {vocab_size}") + + self.static_prefix_embedding = nn.Embedding(vocab_size, hidden_size) + self.embeddings = self.static_prefix_embedding # 保留这个属性 + + self.sensor_mlp = nn.Sequential( + nn.Linear(3, hidden_size // 
2), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size // 2, hidden_size) + ) + + # 添加LayerNorm + self.norm = nn.LayerNorm(hidden_size) + + print(f"[INFO] 模型初始化: hidden_size={hidden_size}, vocab_size={vocab_size}") + + # 检测模型嵌入层路径 + self._detect_embedding_path() + + def _detect_embedding_path(self): + """检测模型的嵌入层路径""" + self.embedding_path = None + + # 尝试不同的常见路径 + if hasattr(self.model, 'transformer') and hasattr(self.model.transformer, 'wte'): + self.embedding_path = "transformer.wte" + elif hasattr(self.model, 'model') and hasattr(self.model.model, 'embed_tokens'): + self.embedding_path = "model.embed_tokens" + elif hasattr(self.model, 'model') and hasattr(self.model.model, 'model') and hasattr(self.model.model.model, 'embed_tokens'): + self.embedding_path = "model.model.model.embed_tokens" + + if self.embedding_path: + print(f"[INFO] 检测到嵌入层路径: {self.embedding_path}") + else: + print("[WARNING] 无法自动检测嵌入层路径,将在前向传播中尝试多种路径") + + def forward(self, input_ids, attention_mask, static_prefix=None, sensor_data=None, labels=None, **kwargs): + batch_size, seq_len = input_ids.shape + device = input_ids.device + + # 确保所有组件在同一设备上 + self.static_prefix_embedding = self.static_prefix_embedding.to(device) + self.sensor_mlp = self.sensor_mlp.to(device) + self.norm = self.norm.to(device) + + # 处理静态前缀 + if static_prefix is not None: + static_prefix = static_prefix.to(device) + static_prefix = self.static_prefix_embedding(static_prefix) + else: + static_prefix = torch.zeros( + (batch_size, self.num_prefix, self.model.config.hidden_size), + device=device + ) + + # 处理动态前缀 + if sensor_data is not None: + sensor_data = sensor_data.to(device) + + if sensor_data.dim() == 1: + sensor_data = sensor_data.unsqueeze(0) + + try: + dynamic_prefix = self.sensor_mlp(sensor_data) + dynamic_prefix = dynamic_prefix.unsqueeze(1).expand(-1, self.num_prefix, -1) + except Exception as e: + print(f"[ERROR] sensor_mlp处理失败: {e}") + dynamic_prefix = torch.zeros_like(static_prefix) + else: + dynamic_prefix = torch.zeros_like(static_prefix) + + # 混合前缀 + alpha = 0.6 + final_prefix = alpha * static_prefix + (1 - alpha) * dynamic_prefix + final_prefix = self.norm(final_prefix) + + # 处理token嵌入 - 根据检测到的路径获取嵌入 + try: + if self.embedding_path == "transformer.wte": + token_embeds = self.model.transformer.wte(input_ids) + elif self.embedding_path == "model.embed_tokens": + token_embeds = self.model.model.embed_tokens(input_ids) + elif self.embedding_path == "model.model.model.embed_tokens": + token_embeds = self.model.model.model.embed_tokens(input_ids) + else: + # 尝试多种可能的路径 + if hasattr(self.model, 'transformer') and hasattr(self.model.transformer, 'wte'): + token_embeds = self.model.transformer.wte(input_ids) + self.embedding_path = "transformer.wte" + elif hasattr(self.model, 'model') and hasattr(self.model.model, 'embed_tokens'): + token_embeds = self.model.model.embed_tokens(input_ids) + self.embedding_path = "model.embed_tokens" + elif hasattr(self.model, 'model') and hasattr(self.model.model, 'model') and hasattr(self.model.model.model, 'embed_tokens'): + token_embeds = self.model.model.model.embed_tokens(input_ids) + self.embedding_path = "model.model.model.embed_tokens" + else: + raise ValueError("无法找到嵌入层路径") + print(f"[INFO] 成功找到嵌入层路径: {self.embedding_path}") + except Exception as e: + print(f"[ERROR] 获取token嵌入失败: {e}") + # 打印模型结构以帮助调试 + print("模型结构:") + for name, _ in self.model.named_modules(): + if 'embed' in name or 'wte' in name: + print(f" - {name}") + raise + + input_embeds = torch.cat((final_prefix, token_embeds), dim=1) + 
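# Shape sketch (annotation, not part of the patch; B = batch size, S = seq_len,
# P = num_prefix, H = hidden_size, assuming the static prefix ids are P tokens
# long so that the prefix and the masks below agree):
#   static_prefix  : (B, P, H)
#   dynamic_prefix : (B, P, H)   # sensor_mlp output, expanded along dim 1
#   final_prefix   : (B, P, H)   # 0.6 * static + 0.4 * dynamic, then LayerNorm
#   token_embeds   : (B, S, H)
#   input_embeds   : (B, P + S, H)
# Because the sequence grew by P positions, the attention mask and the labels
# are extended below by P entries (mask = 1, label = -100) before calling the
# wrapped model with inputs_embeds.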
+ # 扩展注意力掩码 + prefix_attention_mask = torch.ones( + (batch_size, self.num_prefix), + dtype=attention_mask.dtype, + device=device + ) + extended_attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + # 处理标签 + if labels is not None: + # 为前缀部分创建-100的标签(表示忽略) + prefix_labels = torch.full( + (batch_size, self.num_prefix), + fill_value=-100, # -100表示忽略这些位置的损失 + dtype=labels.dtype, + device=device + ) + # 扩展标签 + extended_labels = torch.cat((prefix_labels, labels), dim=1) + else: + extended_labels = None + + # 确保不提供input_ids + if 'input_ids' in kwargs: + del kwargs['input_ids'] + + # 传递扩展后的标签 + return self.model( + inputs_embeds=input_embeds, + attention_mask=extended_attention_mask, + labels=extended_labels, + use_cache=False, + **kwargs) + +# class PrefixKGEmbedding(nn.Module): +# def __init__( +# self, +# num_ent, +# num_rel, +# dim_llm, +# num_prefix +# ): +# super(PrefixKGEmbedding, self).__init__() +# self.emb_dim = num_prefix * dim_llm +# self.ent_embeddings = nn.Embedding(num_ent, self.emb_dim) +# self.rel_embeddings = nn.Embedding(num_rel, self.emb_dim) +# +# +# def forward(self, triple_ids): +# head, relation, tail = triple_ids[:, 0], triple_ids[:, 1], triple_ids[:, 2] +# h = self.ent_embeddings(head) +# r = self.rel_embeddings(relation) +# t = self.ent_embeddings(tail) +# prefix = torch.stack((h, r, t), dim=1) +# return prefix + +class PretrainKGEmbedding(nn.Module): + def __init__( + self, + pretrain_ent_embs, + pretrain_rel_embs, + dim_llm, + num_prefix + ): + super(PretrainKGEmbedding, self).__init__() + self.num_prefix = num_prefix + self.llm_dim = dim_llm + self.emb_dim = num_prefix * dim_llm + self.ent_embeddings = nn.Embedding.from_pretrained(pretrain_ent_embs) + self.rel_embeddings = nn.Embedding.from_pretrained(pretrain_rel_embs) + self.pretrain_dim = self.ent_embeddings.weight.shape[1] + # Froze the pretrain embeddings + self.ent_embeddings.requires_grad_(False) + self.rel_embeddings.requires_grad_(False) + self.adapter = nn.Linear(self.pretrain_dim, self.emb_dim) + + + def forward(self, triple_ids): + # main training stage + if triple_ids.shape[1] == 3: + head, relation, tail = triple_ids[:, 0], triple_ids[:, 1], triple_ids[:, 2] + h = self.ent_embeddings(head) + r = self.rel_embeddings(relation) + t = self.ent_embeddings(tail) + pretrain_embs = torch.stack((h, r, t), dim=1) + prefix = self.adapter(pretrain_embs).reshape(-1, 3*self.num_prefix, self.llm_dim) + return prefix + # entity-aware pre-funing + else: + ent = triple_ids.reshape(-1,) + emb = self.ent_embeddings(ent) + prefix = self.adapter(emb).reshape(-1, self.num_prefix, self.llm_dim) + # print(prefix.shape) + return prefix + diff --git a/__pycache__/kopa.cpython-39.pyc b/__pycache__/kopa.cpython-39.pyc new file mode 100644 index 0000000..db7509a Binary files /dev/null and b/__pycache__/kopa.cpython-39.pyc differ diff --git a/finetune_kopa.py b/finetune_kopa.py index a92ec5b..7066f3f 100644 --- a/finetune_kopa.py +++ b/finetune_kopa.py @@ -16,7 +16,7 @@ import bitsandbytes as bnb """ from peft import PrefixTuningConfig, get_peft_model -from transformers import LlamaForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from utils.prompter import Prompter @@ -26,6 +26,7 @@ def custom_collate_fn(batch): attention_mask_list = [] static_prefix_list = [] sensor_data_list = [] + # qwen_dict= {'llama_eos_tid':, 'qwen_eos_tid':} for b in batch: # 确保输入是张量 @@ -54,11 +55,25 @@ def custom_collate_fn(batch): else: sensor_data = b["sensor_data"] 
sensor_data_list.append(sensor_data) + max_length=0 + for one_inputs in input_ids_list: + max_length = one_inputs.size(0) if max_length < one_inputs.size(0) else max_length + input_ids_list_=list() + for one_inputs in input_ids_list: + input_ids_list_.append(torch.cat((one_inputs, torch.full((max_length-one_inputs.size(0),), 0, dtype=torch.int)), dim=-1)) + + + attention_mask_list_=list() + for mask in attention_mask_list: + attention_mask_list_.append(torch.cat((mask, torch.full((max_length-mask.size(0),), 0, dtype=torch.int)), dim=-1)) + + # print("=====",input_ids_list) + # exit(0) # 堆叠数据 result = { - "input_ids": torch.stack(input_ids_list), - "attention_mask": torch.stack(attention_mask_list), + "input_ids": torch.stack(input_ids_list_), + "attention_mask": torch.stack(attention_mask_list_), } if static_prefix_list: @@ -75,22 +90,26 @@ def custom_collate_fn(batch): else: labels = b["labels"] labels_list.append(labels) + labels_list_=list() + for label in labels_list: + labels_list_.append(torch.cat((label, torch.full((max_length-label.size(0),), 0, dtype=torch.int)), dim=-1)) + - result["labels"] = torch.stack(labels_list) + result["labels"] = torch.stack(labels_list_) return result def train( # model/data params - base_model="models/Llama-3.2-3B-Instruct", - data_path: str = "data/CoDeX-S-train.json", + base_model="/root/shared-nvme/models/Qwen2.5-7B-Instruct", + data_path: str = "/root/shared-nvme/dataset/olive_dataset.json", output_dir: str = "output", # training hyperparams batch_size: int = 16, micro_batch_size: int = 16, num_epochs: int = 2, - learning_rate: float = 3e-4, + learning_rate: float = 1e-4, cutoff_len: int = 512, val_set_size: int = 0, num_prefix: int = 1, @@ -142,25 +161,30 @@ def train( device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} gradient_accumulation_steps = gradient_accumulation_steps // world_size - model = LlamaForCausalLM.from_pretrained( + model = AutoModelForCausalLM.from_pretrained( base_model, - # load_in_8bit=True, + load_in_8bit=True, + # 使用Auto类自动选择正确的模型类型 torch_dtype=torch.float16, device_map=device_map, + trust_remote_code=True, # Qwen模型需要此参数 ) - tokenizer = AutoTokenizer.from_pretrained(base_model) - - + tokenizer = AutoTokenizer.from_pretrained( + base_model, + trust_remote_code=True, # 添加此参数 + padding_side="left", # Qwen也推荐左侧填充 + ) + tokenizer.pad_token = tokenizer.eos_token # tokenizer.pad_token_id = ( # 0 # unk. 
we want this to be different from the eos token # ) - tokenizer.padding_side = "left" # Allow batched inference - - tokenizer.pad_token = tokenizer.eos_token + # tokenizer.padding_side = "left" # Allow batched inference + # model.gradient_checkpointing_enable() + # tokenizer.pad_token = tokenizer.eos_token model.config.pad_token_id = model.config.eos_token_id model.generation_config.pad_token_id = model.generation_config.eos_token_id @@ -212,10 +236,14 @@ def train( tokenized_full_prompt = tokenizer( full_prompt, truncation=True, - max_length=128, - padding="max_length", - return_tensors="pt", + max_length=cutoff_len, + padding=True, + return_tensors='pt', ) + # for k,v in tokenized_full_prompt.items(): print("======k,v",k,v,type(k),type(v)) + + # exit(0) + tokenized_full_prompt = {k: v.squeeze(0) for k, v in tokenized_full_prompt.items()} @@ -233,7 +261,7 @@ def train( tokenized_full_prompt["static_prefix"] = static_prefix # print(f"[DEBUG] static_prefix (after clamp): {static_prefix}") - print(f"[DEBUG] tokenizer vocab_size: {tokenizer.vocab_size}") + # print(f"[DEBUG] tokenizer vocab_size: {tokenizer.vocab_size}") # **处理动态数据** sensor_values = torch.zeros(3, dtype=torch.float) # **默认值为 Tensor,而不是 list** @@ -263,7 +291,7 @@ def train( # 限制范围,防止异常值 sensor_values = torch.clamp(sensor_values, min=-100, max=100) - print(f"[DEBUG] sensor_values (AFTER FIX): {sensor_values}") # 🔥 打印调试信息 + # print(f"[DEBUG] sensor_values (AFTER FIX): {sensor_values}") # 🔥 打印调试信息 if not isinstance(sensor_values, torch.Tensor): sensor_values = torch.tensor(sensor_values, dtype=torch.float) @@ -271,6 +299,13 @@ def train( # 最后增加类型检查和转换 for key in tokenized_full_prompt: + if isinstance(tokenized_full_prompt[key], list): + # Convert lists to tensors + tokenized_full_prompt[key] = torch.tensor(tokenized_full_prompt[key]) + elif isinstance(tokenized_full_prompt[key], torch.Tensor) and tokenized_full_prompt[key].dim() > 1: + # Squeeze extra dimensions if needed + tokenized_full_prompt[key] = tokenized_full_prompt[key].squeeze(0) + if key in ["input_ids", "attention_mask"] and isinstance(tokenized_full_prompt[key], list): tokenized_full_prompt[key] = torch.tensor(tokenized_full_prompt[key], dtype=torch.long) @@ -296,6 +331,7 @@ def train( return tokenized_full_prompt + # 创建PrefixTuning配置 prefix_config = PrefixTuningConfig( @@ -412,6 +448,29 @@ def train( except Exception as e: print(f"[WARNING] 保存PEFT模型时出错: {e}") +def inspect_model_structure(model): + """检查模型结构并打印关键层信息""" + print(f"Model type: {type(model).__name__}") + print(f"Model config: {model.config.__class__.__name__}") + + # 检查嵌入层 + embedding_layers = [] + for name, module in model.named_modules(): + if any(key in name for key in ['embed', 'wte', 'word_embeddings']): + embedding_layers.append((name, type(module).__name__)) + if hasattr(module, 'weight'): + print(f"Layer {name}: shape {module.weight.shape}") + + print(f"Found {len(embedding_layers)} potential embedding layers:") + for name, type_name in embedding_layers: + print(f" - {name}: {type_name}") + + # 检查注意力层 + print("\nAttention structure:") + for name, module in model.named_modules(): + if 'attention' in name.lower(): + print(f" - {name}: {type(module).__name__}") + if __name__ == "__main__": fire.Fire(train) diff --git a/kopa.py b/kopa.py index 8893bcf..ea362ed 100644 --- a/kopa.py +++ b/kopa.py @@ -8,18 +8,24 @@ from transformers import LlamaForCausalLM class KoPA(nn.Module): def __init__( self, - model: LlamaForCausalLM + model ) -> None: super(KoPA, self).__init__() self.llama_model = model - 
self.embeddings = nn.Embedding(100, 3072) - # self.embeddings = PrefixKGEmbedding( - # num_ent=2034, - # num_rel=42, - # dim_llm=3072, - # num_prefix=1 - # ) - + for param in self.model.parameters(): + param.requires_grad = False + + # Only keep gradients for the adapter parts + self.num_prefix = num_prefix + hidden_size = model.config.hidden_size + self.embeddings = nn.Embedding(100, 4096) + for param in model.parameters(): + param.requires_grad = False + + # Only enable gradients for adapter components + self.static_prefix_embedding.requires_grad_(True) + self.sensor_mlp.requires_grad_(True) + self.norm.requires_grad_(True) def forward( self, input_ids: torch.LongTensor = None, @@ -39,7 +45,16 @@ class KoPA(nn.Module): embedding_ids = torch.clamp(embedding_ids, min=0, max=self.embeddings.num_embeddings - 1) kg_embeds = self.embeddings(embedding_ids) batch_size, seq_len, _ = kg_embeds.shape - token_embeds = self.llama_model.model.model.embed_tokens(input_ids) + if hasattr(self.llama_model, 'transformer'): + # Qwen模型 + token_embeds = self.llama_model.transformer.wte(input_ids) + elif hasattr(self.llama_model, 'model') and hasattr(self.llama_model.model, 'embed_tokens'): + # 原始路径 + token_embeds = self.llama_model.model.model.embed_tokens(input_ids) + else: + # 添加调试代码 + print("无法找到模型嵌入层,尝试检测模型结构...") + raise ValueError("模型结构不兼容") input_embeds = torch.cat((kg_embeds, token_embeds), dim=1) prefix_mask = torch.ones((batch_size, seq_len)) prefix_labels = torch.full((batch_size, seq_len), fill_value=-100, dtype=torch.long) @@ -69,9 +84,13 @@ class KoPAWithAdapter(nn.Module): self.model = model self.num_prefix = num_prefix hidden_size = model.config.hidden_size - + + # 打印模型信息以便调试 + print(f"[INFO] 初始化KoPAWithAdapter,模型类型: {type(model).__name__}") + # 使用tokenizer获取vocab_size - vocab_size = tokenizer.vocab_size if tokenizer else 32000 + vocab_size = tokenizer.vocab_size if tokenizer else 151936 # Qwen2.5的默认词表大小 + print(f"[INFO] 使用词表大小: {vocab_size}") self.static_prefix_embedding = nn.Embedding(vocab_size, hidden_size) self.embeddings = self.static_prefix_embedding # 保留这个属性 @@ -87,6 +106,26 @@ class KoPAWithAdapter(nn.Module): self.norm = nn.LayerNorm(hidden_size) print(f"[INFO] 模型初始化: hidden_size={hidden_size}, vocab_size={vocab_size}") + + # 检测模型嵌入层路径 + self._detect_embedding_path() + + def _detect_embedding_path(self): + """检测模型的嵌入层路径""" + self.embedding_path = None + + # 尝试不同的常见路径 + if hasattr(self.model, 'transformer') and hasattr(self.model.transformer, 'wte'): + self.embedding_path = "transformer.wte" + elif hasattr(self.model, 'model') and hasattr(self.model.model, 'embed_tokens'): + self.embedding_path = "model.embed_tokens" + elif hasattr(self.model, 'model') and hasattr(self.model.model, 'model') and hasattr(self.model.model.model, 'embed_tokens'): + self.embedding_path = "model.model.model.embed_tokens" + + if self.embedding_path: + print(f"[INFO] 检测到嵌入层路径: {self.embedding_path}") + else: + print("[WARNING] 无法自动检测嵌入层路径,将在前向传播中尝试多种路径") def forward(self, input_ids, attention_mask, static_prefix=None, sensor_data=None, labels=None, **kwargs): batch_size, seq_len = input_ids.shape @@ -128,8 +167,37 @@ class KoPAWithAdapter(nn.Module): final_prefix = alpha * static_prefix + (1 - alpha) * dynamic_prefix final_prefix = self.norm(final_prefix) - # 处理token嵌入 - token_embeds = self.model.model.embed_tokens(input_ids) + # 处理token嵌入 - 根据检测到的路径获取嵌入 + try: + if self.embedding_path == "transformer.wte": + token_embeds = self.model.transformer.wte(input_ids) + elif self.embedding_path == 
"model.embed_tokens": + token_embeds = self.model.model.embed_tokens(input_ids) + elif self.embedding_path == "model.model.model.embed_tokens": + token_embeds = self.model.model.model.embed_tokens(input_ids) + else: + # 尝试多种可能的路径 + if hasattr(self.model, 'transformer') and hasattr(self.model.transformer, 'wte'): + token_embeds = self.model.transformer.wte(input_ids) + self.embedding_path = "transformer.wte" + elif hasattr(self.model, 'model') and hasattr(self.model.model, 'embed_tokens'): + token_embeds = self.model.model.embed_tokens(input_ids) + self.embedding_path = "model.embed_tokens" + elif hasattr(self.model, 'model') and hasattr(self.model.model, 'model') and hasattr(self.model.model.model, 'embed_tokens'): + token_embeds = self.model.model.model.embed_tokens(input_ids) + self.embedding_path = "model.model.model.embed_tokens" + else: + raise ValueError("无法找到嵌入层路径") + print(f"[INFO] 成功找到嵌入层路径: {self.embedding_path}") + except Exception as e: + print(f"[ERROR] 获取token嵌入失败: {e}") + # 打印模型结构以帮助调试 + print("模型结构:") + for name, _ in self.model.named_modules(): + if 'embed' in name or 'wte' in name: + print(f" - {name}") + raise + input_embeds = torch.cat((final_prefix, token_embeds), dim=1) # 扩展注意力掩码 @@ -140,7 +208,7 @@ class KoPAWithAdapter(nn.Module): ) extended_attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) - # ✨ 关键修复: 处理标签 + # 处理标签 if labels is not None: # 为前缀部分创建-100的标签(表示忽略) prefix_labels = torch.full( @@ -154,22 +222,15 @@ class KoPAWithAdapter(nn.Module): else: extended_labels = None - # 调试输出 - # print(f"[DEBUG] 原始输入大小: {input_ids.shape}") - # print(f"[DEBUG] 扩展嵌入大小: {input_embeds.shape}") - # print(f"[DEBUG] 扩展掩码大小: {extended_attention_mask.shape}") - # if extended_labels is not None: - # print(f"[DEBUG] 扩展标签大小: {extended_labels.shape}") - # 确保不提供input_ids if 'input_ids' in kwargs: del kwargs['input_ids'] - # ✨ 传递扩展后的标签 + # 传递扩展后的标签 return self.model( inputs_embeds=input_embeds, attention_mask=extended_attention_mask, - labels=extended_labels, # 这是关键修改 + labels=extended_labels, use_cache=False, **kwargs) diff --git a/output/dsp-qwen/runs/Mar17_16-37-42_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742200664.p-49fd0de7e984-ackcs-00gjef02.3924.0 b/output/dsp-qwen/runs/Mar17_16-37-42_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742200664.p-49fd0de7e984-ackcs-00gjef02.3924.0 new file mode 100644 index 0000000..0158a7f Binary files /dev/null and b/output/dsp-qwen/runs/Mar17_16-37-42_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742200664.p-49fd0de7e984-ackcs-00gjef02.3924.0 differ diff --git a/output/dsp-qwen/runs/Mar17_16-39-36_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742200778.p-49fd0de7e984-ackcs-00gjef02.4520.0 b/output/dsp-qwen/runs/Mar17_16-39-36_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742200778.p-49fd0de7e984-ackcs-00gjef02.4520.0 new file mode 100644 index 0000000..2d14643 Binary files /dev/null and b/output/dsp-qwen/runs/Mar17_16-39-36_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742200778.p-49fd0de7e984-ackcs-00gjef02.4520.0 differ diff --git a/output/dsp-qwen/runs/Mar17_16-51-48_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742201509.p-49fd0de7e984-ackcs-00gjef02.7056.0 b/output/dsp-qwen/runs/Mar17_16-51-48_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742201509.p-49fd0de7e984-ackcs-00gjef02.7056.0 new file mode 100644 index 0000000..483ea8d Binary files /dev/null and 
b/output/dsp-qwen/runs/Mar17_16-51-48_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742201509.p-49fd0de7e984-ackcs-00gjef02.7056.0 differ diff --git a/output/dsp-qwen/runs/Mar17_19-14-23_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210064.p-49fd0de7e984-ackcs-00gjef02.33927.0 b/output/dsp-qwen/runs/Mar17_19-14-23_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210064.p-49fd0de7e984-ackcs-00gjef02.33927.0 new file mode 100644 index 0000000..8753ae9 Binary files /dev/null and b/output/dsp-qwen/runs/Mar17_19-14-23_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210064.p-49fd0de7e984-ackcs-00gjef02.33927.0 differ diff --git a/output/dsp-qwen/runs/Mar17_19-17-32_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210253.p-49fd0de7e984-ackcs-00gjef02.34819.0 b/output/dsp-qwen/runs/Mar17_19-17-32_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210253.p-49fd0de7e984-ackcs-00gjef02.34819.0 new file mode 100644 index 0000000..9c0721d Binary files /dev/null and b/output/dsp-qwen/runs/Mar17_19-17-32_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210253.p-49fd0de7e984-ackcs-00gjef02.34819.0 differ diff --git a/output/dsp-qwen/runs/Mar17_19-20-56_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210458.p-49fd0de7e984-ackcs-00gjef02.35818.0 b/output/dsp-qwen/runs/Mar17_19-20-56_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210458.p-49fd0de7e984-ackcs-00gjef02.35818.0 new file mode 100644 index 0000000..91c1b6d Binary files /dev/null and b/output/dsp-qwen/runs/Mar17_19-20-56_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210458.p-49fd0de7e984-ackcs-00gjef02.35818.0 differ diff --git a/output/dsp-qwen/runs/Mar17_19-23-08_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210589.p-49fd0de7e984-ackcs-00gjef02.36470.0 b/output/dsp-qwen/runs/Mar17_19-23-08_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210589.p-49fd0de7e984-ackcs-00gjef02.36470.0 new file mode 100644 index 0000000..9e1a76c Binary files /dev/null and b/output/dsp-qwen/runs/Mar17_19-23-08_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742210589.p-49fd0de7e984-ackcs-00gjef02.36470.0 differ diff --git a/output/dsp-qwen/runs/Mar17_19-46-24_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742211985.p-49fd0de7e984-ackcs-00gjef02.41048.0 b/output/dsp-qwen/runs/Mar17_19-46-24_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742211985.p-49fd0de7e984-ackcs-00gjef02.41048.0 new file mode 100644 index 0000000..62f5dce Binary files /dev/null and b/output/dsp-qwen/runs/Mar17_19-46-24_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742211985.p-49fd0de7e984-ackcs-00gjef02.41048.0 differ diff --git a/output/dsp-qwen/runs/Mar17_19-49-50_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742212192.p-49fd0de7e984-ackcs-00gjef02.42000.0 b/output/dsp-qwen/runs/Mar17_19-49-50_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742212192.p-49fd0de7e984-ackcs-00gjef02.42000.0 new file mode 100644 index 0000000..400e566 Binary files /dev/null and b/output/dsp-qwen/runs/Mar17_19-49-50_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742212192.p-49fd0de7e984-ackcs-00gjef02.42000.0 differ diff --git a/output/dsp-qwen/runs/Mar17_19-55-18_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742212519.p-49fd0de7e984-ackcs-00gjef02.43322.0 b/output/dsp-qwen/runs/Mar17_19-55-18_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742212519.p-49fd0de7e984-ackcs-00gjef02.43322.0 new file mode 100644 index 0000000..0572ec4 Binary files /dev/null and 
b/output/dsp-qwen/runs/Mar17_19-55-18_p-49fd0de7e984-ackcs-00gjef02/events.out.tfevents.1742212519.p-49fd0de7e984-ackcs-00gjef02.43322.0 differ
diff --git a/utils/__pycache__/__init__.cpython-39.pyc b/utils/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000..de3652b
Binary files /dev/null and b/utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/utils/__pycache__/prompter.cpython-39.pyc b/utils/__pycache__/prompter.cpython-39.pyc
new file mode 100644
index 0000000..7b584ac
Binary files /dev/null and b/utils/__pycache__/prompter.cpython-39.pyc differ
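For reference, a minimal sanity check of the new custom_collate_fn in finetune_kopa.py, assuming the same Python environment as the patch (torch plus the packages imported by finetune_kopa.py) and made-up token ids; the expected outputs are noted in the comments:

import torch

from finetune_kopa import custom_collate_fn

# Two samples of unequal length, shaped like the dicts produced by
# generate_and_tokenize_prompt (all values are placeholders).
batch = [
    {
        "input_ids": [1, 2, 3],
        "attention_mask": [1, 1, 1],
        "labels": [1, 2, 3],
        "static_prefix": [5] * 10,
        "sensor_data": [21.5, 0.43, 1.2],
    },
    {
        "input_ids": [4, 5],
        "attention_mask": [1, 1],
        "labels": [4, 5],
        "static_prefix": [7] * 10,
        "sensor_data": [19.0, 0.50, 0.9],
    },
]

out = custom_collate_fn(batch)
print(out["input_ids"].shape)    # torch.Size([2, 3]) -- right-padded to the longest sample
print(out["attention_mask"][1])  # tensor([1, 1, 0])
print(out["labels"][1])          # tensor([4, 5, 0]) -- pad labels are 0, not -100
print(out["static_prefix"].shape, out["sensor_data"].shape)  # (2, 10) and (2, 3)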