Update qwen
This commit is contained in:
parent 979b93c284
commit c373c8dd6f
finetune_kopa.py (151 changed lines)
@@ -19,6 +19,43 @@ from peft import PrefixTuningConfig, get_peft_model
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from utils.prompter import Prompter
+
+import os
+
+os.environ["SAFETENSORS_FAST_SAVE"] = "0"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # avoid the tokenizer fork error
+
+
+def untie_shared_weights(model):
+    print("[INFO] Untying shared weights in the model...")
+
+    # For Qwen models, we need to handle specific weight sharing patterns
+    if hasattr(model, "model") and hasattr(model.model, "base_model") and hasattr(model.model.base_model, "model"):
+        base_model = model.model.base_model.model
+
+        # Handle the first shared weights: embed_tokens and word_embeddings
+        if hasattr(base_model, "embed_tokens") and hasattr(model.model, "word_embeddings"):
+            if id(base_model.embed_tokens.weight) == id(model.model.word_embeddings.weight):
+                print("[INFO] Untying shared weights between embed_tokens and word_embeddings")
+                # Create a new tensor with the same values
+                model.model.word_embeddings.weight = torch.nn.Parameter(
+                    base_model.embed_tokens.weight.clone()
+                )
+
+    # Handle the second shared weights: embeddings and static_prefix_embedding
+    if hasattr(model, "embeddings") and hasattr(model, "static_prefix_embedding"):
+        if id(model.embeddings.weight) == id(model.static_prefix_embedding.weight):
+            print("[INFO] Untying shared weights between embeddings and static_prefix_embedding")
+            # Create a new tensor with the same values
+            model.static_prefix_embedding.weight = torch.nn.Parameter(
+                model.embeddings.weight.clone()
+            )
+
+    # Disable any tie_weights methods
+    if hasattr(model, "tie_weights"):
+        model.tie_weights = lambda: None
+        print("[INFO] Disabled tie_weights method")
+
+    print("[INFO] Completed untying shared weights")
+    return model  # return the model so call sites that reassign the result keep a valid reference
+
+
 def custom_collate_fn(batch):
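Aside: tied parameters are the usual reason safetensors serialization fails, because two state-dict entries end up pointing at the same storage. A minimal stand-alone sketch of the same untying idea; the module names below are illustrative, not the ones in this repository.

import torch
import torch.nn as nn

# Two hypothetical embedding tables tied to the same underlying tensor.
emb_a = nn.Embedding(10, 4)
emb_b = nn.Embedding(10, 4)
emb_b.weight = emb_a.weight
assert emb_a.weight.data_ptr() == emb_b.weight.data_ptr()  # shared storage

# Untie by cloning into a fresh Parameter, mirroring what untie_shared_weights() does.
emb_b.weight = nn.Parameter(emb_a.weight.detach().clone())
assert emb_a.weight.data_ptr() != emb_b.weight.data_ptr()  # now independent
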
@@ -60,15 +97,13 @@ def custom_collate_fn(batch):
         max_length = one_inputs.size(0) if max_length < one_inputs.size(0) else max_length
     input_ids_list_ = list()
     for one_inputs in input_ids_list:
-        input_ids_list_.append(torch.cat((one_inputs, torch.full((max_length-one_inputs.size(0),), 0, dtype=torch.int)), dim=-1))
+        input_ids_list_.append(
+            torch.cat((one_inputs, torch.full((max_length - one_inputs.size(0),), 151645, dtype=torch.int)), dim=-1))
 
     attention_mask_list_ = list()
     for mask in attention_mask_list:
-        attention_mask_list_.append(torch.cat((mask, torch.full((max_length-mask.size(0),), 0, dtype=torch.int)), dim=-1))
+        attention_mask_list_.append(
+            torch.cat((mask, torch.full((max_length - mask.size(0),), 0, dtype=torch.int)), dim=-1))
-    # print("=====",input_ids_list)
-    # exit(0)
 
     # stack the data
     result = {
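Aside: 151645 appears to be the Qwen2 tokenizer's <|im_end|> id, which this script also uses as the eos/pad token. A small sketch of driving the padding value from the tokenizer instead of a hard-coded constant; pad_to_length is a hypothetical helper, not part of the repository.

import torch

def pad_to_length(seq: torch.Tensor, max_length: int, pad_id: int) -> torch.Tensor:
    # Right-pad a 1-D id tensor with pad_id up to max_length.
    return torch.cat((seq, torch.full((max_length - seq.size(0),), pad_id, dtype=seq.dtype)), dim=-1)

# In the collate function, pad_id would normally come from the tokenizer, e.g.
# pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
print(pad_to_length(torch.tensor([1, 2, 3]), 5, pad_id=151645))  # tensor([1, 2, 3, 151645, 151645])
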
@@ -92,8 +127,8 @@ def custom_collate_fn(batch):
         labels_list.append(labels)
     labels_list_ = list()
     for label in labels_list:
-        labels_list_.append(torch.cat((label, torch.full((max_length-label.size(0),), 0, dtype=torch.int)), dim=-1))
+        labels_list_.append(
+            torch.cat((label, torch.full((max_length - label.size(0),), 151645, dtype=torch.int)), dim=-1))
 
     result["labels"] = torch.stack(labels_list_)
 
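Note: the Hugging Face causal-LM loss ignores label positions equal to -100, so padding labels with a real token id (151645 here) means the padded positions do contribute to the loss. A sketch of the -100 alternative; pad_labels_to_length is a hypothetical helper.

import torch

IGNORE_INDEX = -100  # positions with this label are skipped by the cross-entropy loss

def pad_labels_to_length(labels: torch.Tensor, max_length: int) -> torch.Tensor:
    # Right-pad a 1-D label tensor with IGNORE_INDEX so padding adds no loss.
    pad = torch.full((max_length - labels.size(0),), IGNORE_INDEX, dtype=labels.dtype)
    return torch.cat((labels, pad), dim=-1)

print(pad_labels_to_length(torch.tensor([5, 6, 7]), 6))  # tensor([5, 6, 7, -100, -100, -100])
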
@@ -163,7 +198,7 @@ def train(
 
     model = AutoModelForCausalLM.from_pretrained(
         base_model,
-        load_in_8bit=True,
+        load_in_8bit=False,
         # use the Auto class so the correct model type is selected automatically
         torch_dtype=torch.float16,
         device_map=device_map,
@@ -177,7 +212,8 @@ def train(
     )
     tokenizer.pad_token = tokenizer.eos_token
 
+    # print("=====",model.config.eos_token_id)
+    # exit(0)
 
     # tokenizer.pad_token_id = (
     #     0  # unk. we want this to be different from the eos token
@@ -203,28 +239,6 @@ def train(
 
         return dataset
 
-    # def tokenize(prompt, add_eos_token=True):
-    #     # there's probably a way to do this with the tokenizer settings
-    #     # but again, gotta move fast
-    #     result = tokenizer(
-    #         prompt,
-    #         truncation=True,
-    #         max_length=cutoff_len,
-    #         padding=False,
-    #         return_tensors=None,
-    #     )
-    #     if (
-    #             result["input_ids"][-1] != tokenizer.eos_token_id
-    #             and len(result["input_ids"]) < cutoff_len
-    #             and add_eos_token
-    #     ):
-    #         result["input_ids"].append(tokenizer.eos_token_id)
-    #         result["attention_mask"].append(1)
-    #
-    #     result["labels"] = result["input_ids"].copy()
-    #
-    #     return result
-
     def generate_and_tokenize_prompt(data_point):
         full_prompt = prompter.generate_prompt(
             data_point["instruction"],
@@ -244,7 +258,6 @@ def train(
 
         # exit(0)
-
         tokenized_full_prompt = {k: v.squeeze(0) for k, v in tokenized_full_prompt.items()}
 
         # handle the static prefix
@@ -319,7 +332,6 @@ def train(
 
         tokenized_full_prompt["labels"] = tokenized_full_prompt["input_ids"].clone()
 
-        # if we do not want loss on the input part, set those label positions to -100
         if not train_on_inputs:
             # find the boundary between the user input and the assistant output
             sep = tokenizer.encode(prompter.separator)
@@ -331,7 +343,6 @@ def train(
 
         return tokenized_full_prompt
 
-
     # create the PrefixTuning configuration
 
     prefix_config = PrefixTuningConfig(
@@ -342,7 +353,6 @@ def train(
     # create the PEFT model
     peft_model = get_peft_model(model, prefix_config)
 
-
     # create the final KoPAWithAdapter model
     final_model = KoPAWithAdapter(peft_model, num_prefix, tokenizer)
     device = next(model.parameters()).device
@@ -351,7 +361,6 @@ def train(
     # make sure final_model and its components are on the same device
     final_model = final_model.to(device)
 
-
     if data_path.endswith(".json") or data_path.endswith(".jsonl"):
         data = load_dataset("json", data_files=data_path)
     else:
@@ -400,6 +409,10 @@ def train(
         model.is_parallelizable = True
         model.model_parallel = True
 
+    untie_shared_weights(final_model)
+
+    # For KoPAWithAdapter models, we need a custom save approach
+
     trainer = transformers.Trainer(
         model=final_model,
         data_collator=custom_collate_fn,
@@ -411,13 +424,13 @@ def train(
             warmup_steps=100,
             num_train_epochs=num_epochs,
             learning_rate=learning_rate,
-            fp16=True,
+            fp16=False,
             logging_steps=10,
             optim="adamw_hf",
             evaluation_strategy="steps" if val_set_size > 0 else "no",
             save_strategy="steps",
             eval_steps=None,
-            save_steps=5000,
+            save_steps=10,
             output_dir=output_dir,
             save_total_limit=2,
             load_best_model_at_end=True if val_set_size > 0 else False,
@@ -432,21 +445,63 @@ def train(
     if torch.__version__ >= "2" and sys.platform != "win32":
         final_model = torch.compile(model)
 
+    untie_shared_weights(final_model)
+
     trainer.train(resume_from_checkpoint=resume_from_checkpoint)
 
-    final_model.save_pretrained(output_dir)
-
-    # ⭐ make sure embeddings exist before saving
-    if hasattr(final_model, "embeddings"):
-        torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
-    else:
-        print("[WARNING] final_model has no embeddings attribute, skipping save.")
-
-    try:
-        final_model.model.save_pretrained(os.path.join(output_dir, "peft_model"))
-        print(f"[INFO] PEFT model saved to {os.path.join(output_dir, 'peft_model')}")
-    except Exception as e:
-        print(f"[WARNING] Error while saving the PEFT model: {e}")
+    try:
+        final_model = untie_shared_weights(final_model)
+        print(f"[INFO] Saving model to {output_dir}")
+
+        # make sure the output directory exists
+        os.makedirs(output_dir, exist_ok=True)
+
+        # for distributed training, save only on the main process
+        if int(os.environ.get("LOCAL_RANK", 0)) == 0:
+            # move the model to CPU for saving
+            model_to_save = final_model.module if hasattr(final_model, "module") else final_model
+            model_to_save = model_to_save.cpu()
+
+            try:
+                # Save the main model components
+                if hasattr(final_model, "save_model"):
+                    final_model.save_model(output_dir)
+                else:
+                    # Save model configuration
+                    if hasattr(final_model, "config"):
+                        final_model.config.save_pretrained(output_dir)
+
+                # Save model state dict
+                torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
+                print(f"[INFO] Successfully saved model state dict")
+
+                # Save embeddings separately if they exist
+                if hasattr(final_model, "embeddings"):
+                    torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
+                    print(f"[INFO] Successfully saved embeddings")
+
+                # Save PEFT model components
+                if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
+                    peft_save_dir = os.path.join(output_dir, "peft_model")
+                    os.makedirs(peft_save_dir, exist_ok=True)
+                    final_model.model.save_pretrained(peft_save_dir)
+                    print(f"[INFO] PEFT model saved to {peft_save_dir}")
+
+                # move the model back to its original device once saving is done
+                model_to_save = model_to_save.to(device)
+
+            except Exception as e:
+                print(f"[ERROR] Error during model saving: {str(e)}")
+                import traceback
+                traceback.print_exc()
+                raise e
+
+    except Exception as e:
+        print(f"[ERROR] Error in save process: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        raise e
 
 
 def inspect_model_structure(model):
     """Inspect the model structure and print key layer information."""
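For reference, a hedged sketch of reading back the artifacts that the save block above writes. The paths mirror that code, but how the KoPAWithAdapter wrapper is reconstructed is an assumption, so treat this as an outline rather than the project's loading routine.

import os
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

def load_saved_artifacts(output_dir: str, base_model_name: str):
    # 1) Base model plus the prefix-tuning weights written to <output_dir>/peft_model.
    base = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16)
    peft_dir = os.path.join(output_dir, "peft_model")
    peft_model = PeftModel.from_pretrained(base, peft_dir)

    # 2) Full wrapper state dict (KoPAWithAdapter) and the extra embedding table.
    wrapper_state = torch.load(os.path.join(output_dir, "pytorch_model.bin"), map_location="cpu")
    embeddings = torch.load(os.path.join(output_dir, "embeddings.pth"), map_location="cpu")

    # The wrapper state dict would then be applied to a freshly constructed wrapper, e.g.
    # KoPAWithAdapter(peft_model, num_prefix, tokenizer).load_state_dict(wrapper_state, strict=False)
    return peft_model, wrapper_state, embeddings
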
kopa.py (23 changed lines)
@@ -16,7 +16,7 @@ class KoPA(nn.Module):
             param.requires_grad = False
 
         # Only keep gradients for the adapter parts
-        self.num_prefix = num_prefix
+        # self.num_prefix = num_prefix
         hidden_size = model.config.hidden_size
         self.embeddings = nn.Embedding(100, 4096)
         for param in model.parameters():
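One observation on this hunk: hidden_size is read from model.config, but the prefix table is still created as nn.Embedding(100, 4096). If the chosen Qwen variant does not use a 4096-wide hidden state, concatenating these embeddings with the token embeddings would presumably fail on the last dimension. A small stand-alone sketch of the shape constraint; all sizes and names below are illustrative.

import torch
import torch.nn as nn

hidden_size = 4096  # in practice: model.config.hidden_size, which differs across Qwen variants
prefix_table = nn.Embedding(100, hidden_size)

token_embeds = torch.randn(2, 16, hidden_size)                    # (batch, seq, hidden)
prefix_embeds = prefix_table(torch.arange(8)).expand(2, -1, -1)   # (batch, num_prefix, hidden)
print(torch.cat((prefix_embeds, token_embeds), dim=1).shape)      # torch.Size([2, 24, 4096])
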
@@ -26,6 +26,7 @@ class KoPA(nn.Module):
         self.static_prefix_embedding.requires_grad_(True)
         self.sensor_mlp.requires_grad_(True)
         self.norm.requires_grad_(True)
+
 
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -119,7 +120,8 @@ class KoPAWithAdapter(nn.Module):
             self.embedding_path = "transformer.wte"
         elif hasattr(self.model, 'model') and hasattr(self.model.model, 'embed_tokens'):
             self.embedding_path = "model.embed_tokens"
-        elif hasattr(self.model, 'model') and hasattr(self.model.model, 'model') and hasattr(self.model.model.model, 'embed_tokens'):
+        elif hasattr(self.model, 'model') and hasattr(self.model.model, 'model') and hasattr(self.model.model.model,
+                                                                                             'embed_tokens'):
             self.embedding_path = "model.model.model.embed_tokens"
 
         if self.embedding_path:
@@ -183,7 +185,8 @@ class KoPAWithAdapter(nn.Module):
         elif hasattr(self.model, 'model') and hasattr(self.model.model, 'embed_tokens'):
             token_embeds = self.model.model.embed_tokens(input_ids)
             self.embedding_path = "model.embed_tokens"
-        elif hasattr(self.model, 'model') and hasattr(self.model.model, 'model') and hasattr(self.model.model.model, 'embed_tokens'):
+        elif hasattr(self.model, 'model') and hasattr(self.model.model, 'model') and hasattr(
+                self.model.model.model, 'embed_tokens'):
             token_embeds = self.model.model.model.embed_tokens(input_ids)
             self.embedding_path = "model.model.model.embed_tokens"
         else:
||||||
@ -226,6 +229,18 @@ class KoPAWithAdapter(nn.Module):
|
|||||||
if 'input_ids' in kwargs:
|
if 'input_ids' in kwargs:
|
||||||
del kwargs['input_ids']
|
del kwargs['input_ids']
|
||||||
|
|
||||||
|
model_dtype = next(self.model.parameters()).dtype
|
||||||
|
input_embeds = input_embeds.to(dtype=model_dtype)
|
||||||
|
|
||||||
|
# Remaining code as before...
|
||||||
|
prefix_attention_mask = torch.ones(
|
||||||
|
(batch_size, self.num_prefix),
|
||||||
|
dtype=attention_mask.dtype,
|
||||||
|
device=device
|
||||||
|
)
|
||||||
|
extended_attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)
|
||||||
|
extended_attention_mask = extended_attention_mask.to(dtype=model_dtype)
|
||||||
|
|
||||||
# 传递扩展后的标签
|
# 传递扩展后的标签
|
||||||
return self.model(
|
return self.model(
|
||||||
inputs_embeds=input_embeds,
|
inputs_embeds=input_embeds,
|
||||||
@ -234,6 +249,7 @@ class KoPAWithAdapter(nn.Module):
|
|||||||
use_cache=False,
|
use_cache=False,
|
||||||
**kwargs)
|
**kwargs)
|
||||||
|
|
||||||
|
|
||||||
# class PrefixKGEmbedding(nn.Module):
|
# class PrefixKGEmbedding(nn.Module):
|
||||||
# def __init__(
|
# def __init__(
|
||||||
# self,
|
# self,
|
||||||
@@ -276,7 +292,6 @@ class PretrainKGEmbedding(nn.Module):
         self.rel_embeddings.requires_grad_(False)
         self.adapter = nn.Linear(self.pretrain_dim, self.emb_dim)
 
-
     def forward(self, triple_ids):
         # main training stage
         if triple_ids.shape[1] == 3: