云服务器版
This commit is contained in:
parent
c373c8dd6f
commit
d9965e3ea3
@ -141,13 +141,13 @@ def train(
|
|||||||
data_path: str = "/root/shared-nvme/dataset/olive_dataset.json",
|
data_path: str = "/root/shared-nvme/dataset/olive_dataset.json",
|
||||||
output_dir: str = "output",
|
output_dir: str = "output",
|
||||||
# training hyperparams
|
# training hyperparams
|
||||||
batch_size: int = 16,
|
batch_size: int = 8,
|
||||||
micro_batch_size: int = 16,
|
micro_batch_size: int = 4,
|
||||||
num_epochs: int = 2,
|
num_epochs: int = 2,
|
||||||
learning_rate: float = 1e-4,
|
learning_rate: float = 1e-5,
|
||||||
cutoff_len: int = 512,
|
cutoff_len: int = 512,
|
||||||
val_set_size: int = 0,
|
val_set_size: int = 0,
|
||||||
num_prefix: int = 1,
|
num_prefix: int = 10,
|
||||||
# llm hyperparams
|
# llm hyperparams
|
||||||
train_on_inputs: bool = True, # if False, masks out inputs in loss
|
train_on_inputs: bool = True, # if False, masks out inputs in loss
|
||||||
add_eos_token: bool = False,
|
add_eos_token: bool = False,
|
||||||
@ -453,54 +453,36 @@ def train(
|
|||||||
final_model = untie_shared_weights(final_model)
|
final_model = untie_shared_weights(final_model)
|
||||||
print(f"[INFO] Saving model to {output_dir}")
|
print(f"[INFO] Saving model to {output_dir}")
|
||||||
|
|
||||||
# 确保输出目录存在
|
# Save the main model components
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
if hasattr(final_model, "save_model"):
|
||||||
|
final_model.save_model(output_dir)
|
||||||
# 如果是分布式训练,只在主进程保存
|
else:
|
||||||
if int(os.environ.get("LOCAL_RANK", 0)) == 0:
|
# Fallback if save_model method doesn't exist
|
||||||
# 将模型移到CPU上保存
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# Save model configuration
|
||||||
|
if hasattr(final_model, "config"):
|
||||||
|
final_model.config.save_pretrained(output_dir)
|
||||||
|
|
||||||
|
# Save model state dict (avoiding shared weights)
|
||||||
model_to_save = final_model.module if hasattr(final_model, "module") else final_model
|
model_to_save = final_model.module if hasattr(final_model, "module") else final_model
|
||||||
model_to_save = model_to_save.cpu()
|
torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
|
||||||
|
|
||||||
try:
|
|
||||||
# Save the main model components
|
|
||||||
if hasattr(final_model, "save_model"):
|
|
||||||
final_model.save_model(output_dir)
|
|
||||||
else:
|
|
||||||
# Save model configuration
|
|
||||||
if hasattr(final_model, "config"):
|
|
||||||
final_model.config.save_pretrained(output_dir)
|
|
||||||
|
|
||||||
# Save model state dict
|
# Save embeddings separately if they exist
|
||||||
torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
|
if hasattr(final_model, "embeddings"):
|
||||||
print(f"[INFO] Successfully saved model state dict")
|
torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
|
||||||
|
|
||||||
# Save embeddings separately if they exist
|
# Save PEFT model components
|
||||||
if hasattr(final_model, "embeddings"):
|
if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
|
||||||
torch.save(final_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
|
peft_save_dir = os.path.join(output_dir, "peft_model")
|
||||||
print(f"[INFO] Successfully saved embeddings")
|
os.makedirs(peft_save_dir, exist_ok=True)
|
||||||
|
final_model.model.save_pretrained(peft_save_dir)
|
||||||
# Save PEFT model components
|
print(f"[INFO] PEFT model saved to {peft_save_dir}")
|
||||||
if hasattr(final_model, "model") and hasattr(final_model.model, "save_pretrained"):
|
|
||||||
peft_save_dir = os.path.join(output_dir, "peft_model")
|
|
||||||
os.makedirs(peft_save_dir, exist_ok=True)
|
|
||||||
final_model.model.save_pretrained(peft_save_dir)
|
|
||||||
print(f"[INFO] PEFT model saved to {peft_save_dir}")
|
|
||||||
|
|
||||||
# 保存完成后将模型移回原设备
|
|
||||||
model_to_save = model_to_save.to(device)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[ERROR] Error during model saving: {str(e)}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
raise e
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[ERROR] Error in save process: {str(e)}")
|
print(f"[ERROR] Error saving model: {e}")
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
raise e
|
|
||||||
|
|
||||||
|
|
||||||
def inspect_model_structure(model):
|
def inspect_model_structure(model):
|
||||||
|
2
kopa.py
2
kopa.py
@ -16,7 +16,7 @@ class KoPA(nn.Module):
|
|||||||
param.requires_grad = False
|
param.requires_grad = False
|
||||||
|
|
||||||
# Only keep gradients for the adapter parts
|
# Only keep gradients for the adapter parts
|
||||||
# self.num_prefix = num_prefix
|
self.num_prefix = num_prefix
|
||||||
hidden_size = model.config.hidden_size
|
hidden_size = model.config.hidden_size
|
||||||
self.embeddings = nn.Embedding(100, 4096)
|
self.embeddings = nn.Embedding(100, 4096)
|
||||||
for param in model.parameters():
|
for param in model.parameters():
|
||||||
|
Loading…
Reference in New Issue
Block a user