Cloud server version

黄子寒 2025-03-27 13:54:14 +08:00
parent c373c8dd6f
commit d9965e3ea3
2 changed files with 28 additions and 46 deletions

View File

@@ -141,13 +141,13 @@ def train(
     data_path: str = "/root/shared-nvme/dataset/olive_dataset.json",
     output_dir: str = "output",
     # training hyperparams
-    batch_size: int = 16,
-    micro_batch_size: int = 16,
+    batch_size: int = 8,
+    micro_batch_size: int = 4,
     num_epochs: int = 2,
-    learning_rate: float = 1e-4,
+    learning_rate: float = 1e-5,
     cutoff_len: int = 512,
     val_set_size: int = 0,
-    num_prefix: int = 1,
+    num_prefix: int = 10,
     # llm hyperparams
     train_on_inputs: bool = True,  # if False, masks out inputs in loss
     add_eos_token: bool = False,
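Both batch knobs shrink (16/16 to 8/4) and the learning rate drops an order of magnitude, which fits the commit's cloud-server target: less GPU memory per forward pass. In alpaca-style trainers like this one, batch_size and micro_batch_size usually combine to set gradient accumulation; a minimal sketch under that assumption (the derivation itself is not visible in this hunk):

# Hedged sketch: how the two batch-size knobs typically interact in
# scripts of this shape; the derivation is an assumption, not shown here.
batch_size = 8        # examples per optimizer step (new default)
micro_batch_size = 4  # examples per forward/backward pass (new default)

# The optimizer now steps once every 2 micro-batches, keeping per-pass
# memory low while preserving an effective batch size of 8.
gradient_accumulation_steps = batch_size // micro_batch_size
assert gradient_accumulation_steps == 2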
@@ -453,54 +453,36 @@ def train(
         final_model = untie_shared_weights(final_model)
         print(f"[INFO] Saving model to {output_dir}")
-        # Make sure the output directory exists
-        os.makedirs(output_dir, exist_ok=True)
-        # In distributed training, save only from the main process
-        if int(os.environ.get("LOCAL_RANK", 0)) == 0:
-            # Move the model to CPU for saving
-            model_to_save = final_model.module if hasattr(final_model, "module") else final_model
-            model_to_save = model_to_save.cpu()
-            try:
-                # Save the main model components
-                if hasattr(final_model, "save_model"):
-                    final_model.save_model(output_dir)
-                else:
-                    # Save model configuration
-                    if hasattr(final_model, "config"):
-                        final_model.config.save_pretrained(output_dir)
-                    # Save model state dict
-                    torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
-                    print(f"[INFO] Successfully saved model state dict")
-                    # Save embeddings separately if they exist
-                    if hasattr(final_model, "embeddings"):
-                        torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
-                        print(f"[INFO] Successfully saved embeddings")
-                    # Save PEFT model components
-                    if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
-                        peft_save_dir = os.path.join(output_dir, "peft_model")
-                        os.makedirs(peft_save_dir, exist_ok=True)
-                        final_model.model.save_pretrained(peft_save_dir)
-                        print(f"[INFO] PEFT model saved to {peft_save_dir}")
-                # Move the model back to its original device after saving
-                model_to_save = model_to_save.to(device)
-            except Exception as e:
-                print(f"[ERROR] Error during model saving: {str(e)}")
-                import traceback
-                traceback.print_exc()
-                raise e
+        # Save the main model components
+        if hasattr(final_model, "save_model"):
+            final_model.save_model(output_dir)
+        else:
+            # Fallback if save_model method doesn't exist
+            os.makedirs(output_dir, exist_ok=True)
+            # Save model configuration
+            if hasattr(final_model, "config"):
+                final_model.config.save_pretrained(output_dir)
+            # Save model state dict (avoiding shared weights)
+            model_to_save = final_model.module if hasattr(final_model, "module") else final_model
+            torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
+            # Save embeddings separately if they exist
+            if hasattr(final_model, "embeddings"):
+                torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
+            # Save PEFT model components
+            if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
+                peft_save_dir = os.path.join(output_dir, "peft_model")
+                os.makedirs(peft_save_dir, exist_ok=True)
+                final_model.model.save_pretrained(peft_save_dir)
+                print(f"[INFO] PEFT model saved to {peft_save_dir}")
     except Exception as e:
-        print(f"[ERROR] Error in save process: {str(e)}")
+        print(f"[ERROR] Error saving model: {e}")
         import traceback
         traceback.print_exc()
         raise e

 def inspect_model_structure(model):
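For reference, the artifacts written by the new save path can be reloaded roughly as follows. This is a hedged sketch: the file names (pytorch_model.bin, embeddings.pth, peft_model/) come from the diff, but the loading code itself is an assumption, not part of this commit, and `model` stands in for the same KoPA-style module the script trains.

import os
import torch

output_dir = "output"

# Restore the main weights (saved via state_dict above); strict=False
# tolerates the shared/untied parameters handled by untie_shared_weights.
state_dict = torch.load(os.path.join(output_dir, "pytorch_model.bin"), map_location="cpu")
model.load_state_dict(state_dict, strict=False)

# Restore the separately saved prefix embeddings, if present.
emb_path = os.path.join(output_dir, "embeddings.pth")
if os.path.exists(emb_path):
    model.embeddings = torch.load(emb_path, map_location="cpu")

# Restore the PEFT adapter weights, if they were saved.
peft_dir = os.path.join(output_dir, "peft_model")
if os.path.isdir(peft_dir):
    from peft import PeftModel
    model.model = PeftModel.from_pretrained(model.model, peft_dir)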

View File

@@ -16,7 +16,7 @@ class KoPA(nn.Module):
             param.requires_grad = False
         # Only keep gradients for the adapter parts
-        # self.num_prefix = num_prefix
+        self.num_prefix = num_prefix
         hidden_size = model.config.hidden_size
         self.embeddings = nn.Embedding(100, 4096)
         for param in model.parameters():
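Uncommenting self.num_prefix pairs with the num_prefix default being raised from 1 to 10 in the training script: the adapter prepends that many learned prefix embeddings to the token sequence while the base model stays frozen. Below is a minimal sketch of the usual prefix-injection pattern; all shapes are assumptions rather than taken from the file. Note that the diff hard-codes nn.Embedding(100, 4096) instead of using the hidden_size it just computed, which only lines up when the base model's hidden size is actually 4096.

import torch
import torch.nn as nn

# Hedged sketch of prefix injection; shapes are illustrative assumptions.
batch_size, seq_len, hidden_size = 2, 16, 4096
num_prefix = 10

embeddings = nn.Embedding(100, hidden_size)  # mirrors the diff's lookup table
token_embeds = torch.randn(batch_size, seq_len, hidden_size)  # stand-in for the LLM's embed_tokens output

# Look up num_prefix learned vectors per example and prepend them, so the
# frozen LLM attends to them like extra (virtual) tokens.
prefix_ids = torch.arange(num_prefix).unsqueeze(0).expand(batch_size, -1)
prefix_embeds = embeddings(prefix_ids)  # (batch, num_prefix, hidden)
inputs_embeds = torch.cat([prefix_embeds, token_embeds], dim=1)
assert inputs_embeds.shape == (batch_size, seq_len + num_prefix, hidden_size)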