diff --git a/finetune_kopa.py b/finetune_kopa.py
index 8e37193..eb23207 100644
--- a/finetune_kopa.py
+++ b/finetune_kopa.py
@@ -141,13 +141,13 @@ def train(
     data_path: str = "/root/shared-nvme/dataset/olive_dataset.json",
     output_dir: str = "output",
     # training hyperparams
-    batch_size: int = 16,
-    micro_batch_size: int = 16,
+    batch_size: int = 8,
+    micro_batch_size: int = 4,
     num_epochs: int = 2,
-    learning_rate: float = 1e-4,
+    learning_rate: float = 1e-5,
     cutoff_len: int = 512,
     val_set_size: int = 0,
-    num_prefix: int = 1,
+    num_prefix: int = 10,
     # llm hyperparams
     train_on_inputs: bool = True,  # if False, masks out inputs in loss
     add_eos_token: bool = False,
@@ -453,54 +453,36 @@ def train(
         final_model = untie_shared_weights(final_model)
         print(f"[INFO] Saving model to {output_dir}")

-        # Make sure the output directory exists
-        os.makedirs(output_dir, exist_ok=True)
-
-        # For distributed training, save only from the main process
-        if int(os.environ.get("LOCAL_RANK", 0)) == 0:
-            # Move the model to CPU for saving
+        # Save the main model components
+        if hasattr(final_model, "save_model"):
+            final_model.save_model(output_dir)
+        else:
+            # Fallback if save_model method doesn't exist
+            os.makedirs(output_dir, exist_ok=True)
+
+            # Save model configuration
+            if hasattr(final_model, "config"):
+                final_model.config.save_pretrained(output_dir)
+
+            # Save model state dict (avoiding shared weights)
             model_to_save = final_model.module if hasattr(final_model, "module") else final_model
-            model_to_save = model_to_save.cpu()
-
-            try:
-                # Save the main model components
-                if hasattr(final_model, "save_model"):
-                    final_model.save_model(output_dir)
-                else:
-                    # Save model configuration
-                    if hasattr(final_model, "config"):
-                        final_model.config.save_pretrained(output_dir)
+            torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))

-                    # Save model state dict
-                    torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
-                    print(f"[INFO] Successfully saved model state dict")
+            # Save embeddings separately if they exist
+            if hasattr(final_model, "embeddings"):
+                torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))

-                # Save embeddings separately if they exist
-                if hasattr(final_model, "embeddings"):
-                    torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
-                    print(f"[INFO] Successfully saved embeddings")
-
-                # Save PEFT model components
-                if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
-                    peft_save_dir = os.path.join(output_dir, "peft_model")
-                    os.makedirs(peft_save_dir, exist_ok=True)
-                    final_model.model.save_pretrained(peft_save_dir)
-                    print(f"[INFO] PEFT model saved to {peft_save_dir}")
-
-                # Move the model back to its original device after saving
-                model_to_save = model_to_save.to(device)
-
-            except Exception as e:
-                print(f"[ERROR] Error during model saving: {str(e)}")
-                import traceback
-                traceback.print_exc()
-                raise e
+            # Save PEFT model components
+            if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
+                peft_save_dir = os.path.join(output_dir, "peft_model")
+                os.makedirs(peft_save_dir, exist_ok=True)
+                final_model.model.save_pretrained(peft_save_dir)
+                print(f"[INFO] PEFT model saved to {peft_save_dir}")

     except Exception as e:
-        print(f"[ERROR] Error in save process: {str(e)}")
+        print(f"[ERROR] Error saving model: {e}")
         import traceback
         traceback.print_exc()
-        raise e


 def inspect_model_structure(model):
diff --git a/kopa.py b/kopa.py
index 13b41ab..9191627 100644
--- a/kopa.py
+++ b/kopa.py
@@ -16,7 +16,7 @@ class KoPA(nn.Module):
             param.requires_grad = False  # Only keep gradients for the adapter parts
-        # self.num_prefix = num_prefix
+        self.num_prefix = num_prefix
         hidden_size = model.config.hidden_size
         self.embeddings = nn.Embedding(100, 4096)
         for param in model.parameters():
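
Note (not part of the patch): the sketch below shows how the artifacts written by the new fallback save path above (pytorch_model.bin, embeddings.pth, and the peft_model/ directory) might be reloaded. The base-checkpoint path and the KoPA(model, num_prefix) constructor signature are assumptions inferred from the kopa.py hunk, not guaranteed by this diff.

# Sketch only: reloading the fallback-save artifacts under the assumptions above.
import os

import torch
from peft import PeftModel
from transformers import LlamaForCausalLM

from kopa import KoPA  # adapter wrapper defined in kopa.py

output_dir = "output"

# Base weights plus the LoRA adapter written to output/peft_model by save_pretrained().
base = LlamaForCausalLM.from_pretrained("path/to/base-llama")  # placeholder checkpoint path
base = PeftModel.from_pretrained(base, os.path.join(output_dir, "peft_model"))

# Rebuild the wrapper with the same prefix length used for training (num_prefix=10 above),
# then restore the saved state dict; strict=False tolerates the untied shared weights.
model = KoPA(base, num_prefix=10)
state = torch.load(os.path.join(output_dir, "pytorch_model.bin"), map_location="cpu")
model.load_state_dict(state, strict=False)

# The prefix-embedding table was also saved on its own and can be swapped in directly.
model.embeddings = torch.load(os.path.join(output_dir, "embeddings.pth"), map_location="cpu")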