云服务器版
This commit is contained in:
parent
c373c8dd6f
commit
d9965e3ea3
@ -141,13 +141,13 @@ def train(
|
||||
data_path: str = "/root/shared-nvme/dataset/olive_dataset.json",
|
||||
output_dir: str = "output",
|
||||
# training hyperparams
|
||||
batch_size: int = 16,
|
||||
micro_batch_size: int = 16,
|
||||
batch_size: int = 8,
|
||||
micro_batch_size: int = 4,
|
||||
num_epochs: int = 2,
|
||||
learning_rate: float = 1e-4,
|
||||
learning_rate: float = 1e-5,
|
||||
cutoff_len: int = 512,
|
||||
val_set_size: int = 0,
|
||||
num_prefix: int = 1,
|
||||
num_prefix: int = 10,
|
||||
# llm hyperparams
|
||||
train_on_inputs: bool = True, # if False, masks out inputs in loss
|
||||
add_eos_token: bool = False,
|
||||
@ -453,54 +453,36 @@ def train(
|
||||
final_model = untie_shared_weights(final_model)
|
||||
print(f"[INFO] Saving model to {output_dir}")
|
||||
|
||||
# 确保输出目录存在
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# 如果是分布式训练,只在主进程保存
|
||||
if int(os.environ.get("LOCAL_RANK", 0)) == 0:
|
||||
# 将模型移到CPU上保存
|
||||
# Save the main model components
|
||||
if hasattr(final_model, "save_model"):
|
||||
final_model.save_model(output_dir)
|
||||
else:
|
||||
# Fallback if save_model method doesn't exist
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# Save model configuration
|
||||
if hasattr(final_model, "config"):
|
||||
final_model.config.save_pretrained(output_dir)
|
||||
|
||||
# Save model state dict (avoiding shared weights)
|
||||
model_to_save = final_model.module if hasattr(final_model, "module") else final_model
|
||||
model_to_save = model_to_save.cpu()
|
||||
|
||||
try:
|
||||
# Save the main model components
|
||||
if hasattr(final_model, "save_model"):
|
||||
final_model.save_model(output_dir)
|
||||
else:
|
||||
# Save model configuration
|
||||
if hasattr(final_model, "config"):
|
||||
final_model.config.save_pretrained(output_dir)
|
||||
torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
|
||||
|
||||
# Save model state dict
|
||||
torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
|
||||
print(f"[INFO] Successfully saved model state dict")
|
||||
# Save embeddings separately if they exist
|
||||
if hasattr(final_model, "embeddings"):
|
||||
torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
|
||||
|
||||
# Save embeddings separately if they exist
|
||||
if hasattr(final_model, "embeddings"):
|
||||
torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
|
||||
print(f"[INFO] Successfully saved embeddings")
|
||||
|
||||
# Save PEFT model components
|
||||
if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
|
||||
peft_save_dir = os.path.join(output_dir, "peft_model")
|
||||
os.makedirs(peft_save_dir, exist_ok=True)
|
||||
final_model.model.save_pretrained(peft_save_dir)
|
||||
print(f"[INFO] PEFT model saved to {peft_save_dir}")
|
||||
|
||||
# 保存完成后将模型移回原设备
|
||||
model_to_save = model_to_save.to(device)
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] Error during model saving: {str(e)}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
raise e
|
||||
# Save PEFT model components
|
||||
if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
|
||||
peft_save_dir = os.path.join(output_dir, "peft_model")
|
||||
os.makedirs(peft_save_dir, exist_ok=True)
|
||||
final_model.model.save_pretrained(peft_save_dir)
|
||||
print(f"[INFO] PEFT model saved to {peft_save_dir}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] Error in save process: {str(e)}")
|
||||
print(f"[ERROR] Error saving model: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
raise e
|
||||
|
||||
|
||||
def inspect_model_structure(model):
|
||||
|
2
kopa.py
2
kopa.py
@ -16,7 +16,7 @@ class KoPA(nn.Module):
|
||||
param.requires_grad = False
|
||||
|
||||
# Only keep gradients for the adapter parts
|
||||
# self.num_prefix = num_prefix
|
||||
self.num_prefix = num_prefix
|
||||
hidden_size = model.config.hidden_size
|
||||
self.embeddings = nn.Embedding(100, 4096)
|
||||
for param in model.parameters():
|
||||
|
Loading…
Reference in New Issue
Block a user