云服务器版

2025-03-27 13:54:14 +08:00 · 2025-03-27 13:54:14 +08:00 · d9965e3ea3
commit d9965e3ea3
parent c373c8dd6f
2 changed files with 28 additions and 46 deletions
--- a/finetune_kopa.py
+++ b/finetune_kopa.py
@@ -141,13 +141,13 @@ def train(
        data_path: str = "/root/shared-nvme/dataset/olive_dataset.json",
        output_dir: str = "output",
        # training hyperparams
-        batch_size: int = 16,
+        batch_size: int = 8,
-        micro_batch_size: int = 16,
+        micro_batch_size: int = 4,
        num_epochs: int = 2,
-        learning_rate: float = 1e-4,
+        learning_rate: float = 1e-5,
        cutoff_len: int = 512,
        val_set_size: int = 0,
-        num_prefix: int = 1,
+        num_prefix: int = 10,
        # llm hyperparams
        train_on_inputs: bool = True,  # if False, masks out inputs in loss
        add_eos_token: bool = False,
@@ -453,54 +453,36 @@ def train(
        final_model = untie_shared_weights(final_model)
        print(f"[INFO] Saving model to {output_dir}")
-        # 确保输出目录存在
+        # Save the main model components
-        os.makedirs(output_dir, exist_ok=True)
+        if hasattr(final_model, "save_model"):
-        
+            final_model.save_model(output_dir)
-        # 如果是分布式训练，只在主进程保存
+        else:
-        if int(os.environ.get("LOCAL_RANK", 0)) == 0:
+            # Fallback if save_model method doesn't exist
-            # 将模型移到CPU上保存
+            os.makedirs(output_dir, exist_ok=True)
            # Save model configuration
            if hasattr(final_model, "config"):
                final_model.config.save_pretrained(output_dir)
            # Save model state dict (avoiding shared weights)
            model_to_save = final_model.module if hasattr(final_model, "module") else final_model
-            model_to_save = model_to_save.cpu()
+            torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
            try:
                # Save the main model components
                if hasattr(final_model, "save_model"):
                    final_model.save_model(output_dir)
                else:
                    # Save model configuration
                    if hasattr(final_model, "config"):
                        final_model.config.save_pretrained(output_dir)
-                    # Save model state dict
+        # Save embeddings separately if they exist
-                    torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
+        if hasattr(final_model, "embeddings"):
-                    print(f"[INFO] Successfully saved model state dict")
+            torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
-                    # Save embeddings separately if they exist
+        # Save PEFT model components
-                    if hasattr(final_model, "embeddings"):
+        if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
-                        torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
+            peft_save_dir = os.path.join(output_dir, "peft_model")
-                        print(f"[INFO] Successfully saved embeddings")
+            os.makedirs(peft_save_dir, exist_ok=True)
-
+            final_model.model.save_pretrained(peft_save_dir)
-                    # Save PEFT model components
+            print(f"[INFO] PEFT model saved to {peft_save_dir}")
                    if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
                        peft_save_dir = os.path.join(output_dir, "peft_model")
                        os.makedirs(peft_save_dir, exist_ok=True)
                        final_model.model.save_pretrained(peft_save_dir)
                        print(f"[INFO] PEFT model saved to {peft_save_dir}")
                # 保存完成后将模型移回原设备
                model_to_save = model_to_save.to(device)
            except Exception as e:
                print(f"[ERROR] Error during model saving: {str(e)}")
                import traceback
                traceback.print_exc()
                raise e
    except Exception as e:
-        print(f"[ERROR] Error in save process: {str(e)}")
+        print(f"[ERROR] Error saving model: {e}")
        import traceback
        traceback.print_exc()
        raise e
 def inspect_model_structure(model):
--- a/kopa.py
+++ b/kopa.py
@@ -16,7 +16,7 @@ class KoPA(nn.Module):
            param.requires_grad = False
        # Only keep gradients for the adapter parts
-        # self.num_prefix = num_prefix
+        self.num_prefix = num_prefix
        hidden_size = model.config.hidden_size
        self.embeddings = nn.Embedding(100, 4096)
        for param in model.parameters():