first commit

This commit is contained in:
Zhang-Each 2023-10-11 11:51:08 +08:00
parent 9003e58b3f
commit 6362475d20
24 changed files with 14218 additions and 1 deletion

BIN
.DS_Store vendored Normal file

Binary file not shown.

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 ZJUKG
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

@@ -1 +1,51 @@
# KoPA
# Making Large Language Models Perform Better in Knowledge Graph Completion
![](https://img.shields.io/badge/version-1.0.1-blue)
[![license](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/zjukg/KoPA/blob/main/LICENSE)
[![AAAI](https://img.shields.io/badge/NLPCC'23-brightgreen)](http://tcci.ccf.org.cn/conference/2023/)
[![Pytorch](https://img.shields.io/badge/PyTorch-%23EE4C2C.svg?e&logo=PyTorch&logoColor=white)](https://pytorch.org/)
- [Making Large Language Models Perform Better in Knowledge Graph Completion](https://arxiv.org/abs/2310.06671)
> Large language model (LLM) based knowledge graph completion (KGC) aims to predict the missing triples in the KGs with LLMs and enrich the KGs to become better web infrastructure, which can benefit a lot of web-based automatic services. However, research about LLM-based KGC is limited and lacks effective utilization of LLM's inference capabilities, which ignores the important structural information in KGs and prevents LLMs from acquiring accurate factual knowledge. In this paper, we discuss how to incorporate the helpful KG structural information into the LLMs, aiming to achieve structural-aware reasoning in the LLMs. We first transfer the existing LLM paradigms to structural-aware settings and further propose a knowledge prefix adapter (KoPA) to fulfill this stated goal. KoPA employs structural embedding pre-training to capture the structural information of entities and relations in the KG. Then KoPA informs the LLMs of the knowledge prefix adapter which projects the structural embeddings into the textual space and obtains virtual knowledge tokens as a prefix of the input prompt. We conduct comprehensive experiments on these structural-aware LLM-based KGC methods and provide an in-depth analysis comparing how the introduction of structural information would be better for LLM's knowledge reasoning ability.
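In a nutshell, KoPA turns the pre-trained structural embeddings of a triple into a few virtual tokens that are prepended to the prompt. Below is a minimal sketch of that idea, simplified from `kopa.py` in this commit; the class and variable names are illustrative, and 4096 is the LLaMA-7B hidden size used as the default here.
```python
import torch
import torch.nn as nn

class KnowledgePrefix(nn.Module):
    """Frozen KG embeddings -> linear adapter -> virtual prefix tokens (simplified from kopa.py)."""
    def __init__(self, ent_embs, rel_embs, dim_llm=4096, num_prefix=1):
        super().__init__()
        self.ent = nn.Embedding.from_pretrained(ent_embs, freeze=True)
        self.rel = nn.Embedding.from_pretrained(rel_embs, freeze=True)
        self.adapter = nn.Linear(ent_embs.shape[1], num_prefix * dim_llm)
        self.num_prefix, self.dim_llm = num_prefix, dim_llm

    def forward(self, triple_ids):  # triple_ids: (batch, 3) = (head, relation, tail) ids
        h, r, t = triple_ids[:, 0], triple_ids[:, 1], triple_ids[:, 2]
        kge = torch.stack((self.ent(h), self.rel(r), self.ent(t)), dim=1)  # (batch, 3, kge_dim)
        # Project into the LLM embedding space: (batch, 3 * num_prefix, dim_llm)
        return self.adapter(kge).reshape(-1, 3 * self.num_prefix, self.dim_llm)

# The resulting prefix is concatenated in front of the token embeddings before the LLM forward
# pass, e.g. input_embeds = torch.cat((prefix, token_embeds), dim=1), with matching
# attention-mask and label padding for the prefix positions.
```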
## 🌈 Model Architecture
![Model_architecture](figure/model.png)
## 🔬 Dependencies
Our code is developed based on [alpaca-lora](https://github.com/tloen/alpaca-lora). Please build the Python environment following the instructions in Alpaca-LoRA.
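For a quick start, a typical setup might look like the following (the conda environment name and Python version are assumptions, not requirements of this repo):
```shell
conda create -n kopa python=3.9 -y
conda activate kopa
pip install -r requirements.txt
```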
## 📕 Training & Test
- run KoPA tuning
```shell
export WANDB_DISABLED=true
wandb offline
CUDA_VISIBLE_DEVICES=0 nohup python finetune_kopa.py \
--base_model 'YOUR LLM PATH' \
--data_path 'data/UMLS-train.json' \
--output_dir 'YOUR SAVE PATH' \
--num_epochs 3 \
--lora_r 32 \
--learning_rate 3e-4 \
--batch_size 12 \
--micro_batch_size 12 \
--num_prefix 1 \
--kge_model 'data/UMLS-rotate.pth' \
--lora_target_modules='[q_proj,k_proj,v_proj,o_proj]' > log.txt &
```
You need to fill in the LLM path and the save path before running.
- run inference (fill in the script paths first; see the note below)
```shell
CUDA_VISIBLE_DEVICES=0 python inference_kopa.py
```
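Note that `inference_kopa.py` reads its paths from constants inside the script rather than from command-line flags, so fill these in first (the placeholder values below are the ones shipped in the script):
```python
base_path = 'YOUR LLM PATH'             # base LLaMA checkpoint used for KoPA tuning
lora_weights = "YOUR SAVE PATH"         # the output_dir produced by finetune_kopa.py
test_data_path = "data/UMLS-test.json"  # test triples with their embedding ids
```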
## 🤝 Cite:
Please consider citing this paper if you use the code from our work.
Thanks a lot :)
```bibtex
TBD
```

BIN
data/UMLS-rotate.pth Normal file

Binary file not shown.

1
data/UMLS-test.json Normal file

File diff suppressed because one or more lines are too long

1
data/UMLS-train.json Normal file

File diff suppressed because one or more lines are too long

13042
data/UMLS-valid.json Normal file

File diff suppressed because it is too large

BIN
figure/model.png Normal file

Binary file not shown.


273
finetune.py Normal file

@@ -0,0 +1,273 @@
import os
import sys
from typing import List
import time
import fire
import torch
import transformers
from datasets import load_dataset
"""
Unused imports:
import torch.nn as nn
import bitsandbytes as bnb
"""
from peft import (
LoraConfig,
get_peft_model,
get_peft_model_state_dict,
prepare_model_for_int8_training,
set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer
from utils.prompter import Prompter
def train(
# model/data params
base_model: str = "", # the only required argument
    data_path: str = "YOUR DATA PATH",
output_dir: str = "./lora-alpaca",
# training hyperparams
batch_size: int = 16,
micro_batch_size: int = 16,
num_epochs: int = 2,
learning_rate: float = 3e-4,
cutoff_len: int = 512,
val_set_size: int = 0,
# lora hyperparams
lora_r: int = 16,
lora_alpha: int = 16,
lora_dropout: float = 0.05,
lora_target_modules: List[str] = [
"q_proj",
"v_proj",
],
# llm hyperparams
train_on_inputs: bool = True, # if False, masks out inputs in loss
add_eos_token: bool = False,
group_by_length: bool = False, # faster, but produces an odd training loss curve
# wandb params
wandb_project: str = "",
wandb_run_name: str = "",
wandb_watch: str = "", # options: false | gradients | all
wandb_log_model: str = "", # options: false | true
resume_from_checkpoint: str = None, # either training checkpoint or final adapter
prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca.
):
if int(os.environ.get("LOCAL_RANK", 0)) == 0:
print(
f"Training Alpaca-LoRA model with params:\n"
f"base_model: {base_model}\n"
f"data_path: {data_path}\n"
f"output_dir: {output_dir}\n"
f"batch_size: {batch_size}\n"
f"micro_batch_size: {micro_batch_size}\n"
f"num_epochs: {num_epochs}\n"
f"learning_rate: {learning_rate}\n"
f"cutoff_len: {cutoff_len}\n"
f"val_set_size: {val_set_size}\n"
f"lora_r: {lora_r}\n"
f"lora_alpha: {lora_alpha}\n"
f"lora_dropout: {lora_dropout}\n"
f"lora_target_modules: {lora_target_modules}\n"
f"train_on_inputs: {train_on_inputs}\n"
f"add_eos_token: {add_eos_token}\n"
f"group_by_length: {group_by_length}\n"
f"wandb_project: {wandb_project}\n"
f"wandb_run_name: {wandb_run_name}\n"
f"wandb_watch: {wandb_watch}\n"
f"wandb_log_model: {wandb_log_model}\n"
f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
f"prompt template: {prompt_template_name}\n"
)
assert (
base_model
), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
gradient_accumulation_steps = batch_size // micro_batch_size
prompter = Prompter(prompt_template_name)
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
gradient_accumulation_steps = gradient_accumulation_steps // world_size
model = LlamaForCausalLM.from_pretrained(
base_model,
# load_in_8bit=True,
torch_dtype=torch.float16,
device_map=device_map,
)
tokenizer = LlamaTokenizer.from_pretrained(base_model)
tokenizer.pad_token_id = (
0 # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left" # Allow batched inference
def tokenize(prompt, add_eos_token=True):
# there's probably a way to do this with the tokenizer settings
# but again, gotta move fast
result = tokenizer(
prompt,
truncation=True,
max_length=cutoff_len,
padding=False,
return_tensors=None,
)
if (
result["input_ids"][-1] != tokenizer.eos_token_id
and len(result["input_ids"]) < cutoff_len
and add_eos_token
):
result["input_ids"].append(tokenizer.eos_token_id)
result["attention_mask"].append(1)
result["labels"] = result["input_ids"].copy()
return result
def generate_and_tokenize_prompt(data_point):
full_prompt = prompter.generate_prompt(
data_point["instruction"],
data_point["input"],
data_point["output"],
)
tokenized_full_prompt = tokenize(full_prompt)
if not train_on_inputs:
user_prompt = prompter.generate_prompt(
data_point["instruction"], data_point["input"]
)
tokenized_user_prompt = tokenize(
user_prompt, add_eos_token=add_eos_token
)
user_prompt_len = len(tokenized_user_prompt["input_ids"])
if add_eos_token:
user_prompt_len -= 1
tokenized_full_prompt["labels"] = [
-100
] * user_prompt_len + tokenized_full_prompt["labels"][
user_prompt_len:
] # could be sped up, probably
return tokenized_full_prompt
model = prepare_model_for_int8_training(model)
config = LoraConfig(
r=lora_r,
lora_alpha=lora_alpha,
target_modules=lora_target_modules,
lora_dropout=lora_dropout,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
if data_path.endswith(".json") or data_path.endswith(".jsonl"):
data = load_dataset("json", data_files=data_path)
else:
data = load_dataset(data_path)
if resume_from_checkpoint:
# Check the available weights and load them
checkpoint_name = os.path.join(
resume_from_checkpoint, "pytorch_model.bin"
) # Full checkpoint
if not os.path.exists(checkpoint_name):
checkpoint_name = os.path.join(
resume_from_checkpoint, "adapter_model.bin"
) # only LoRA model - LoRA config above has to fit
resume_from_checkpoint = (
False # So the trainer won't try loading its state
)
# The two files above have a different name depending on how they were saved, but are actually the same.
if os.path.exists(checkpoint_name):
print(f"Restarting from {checkpoint_name}")
adapters_weights = torch.load(checkpoint_name)
set_peft_model_state_dict(model, adapters_weights)
else:
print(f"Checkpoint {checkpoint_name} not found")
model.print_trainable_parameters() # Be more transparent about the % of trainable params.
if val_set_size > 0:
train_val = data["train"].train_test_split(
test_size=val_set_size, shuffle=True, seed=42
)
train_data = (
train_val["train"].shuffle().map(generate_and_tokenize_prompt)
)
val_data = (
train_val["test"].shuffle().map(generate_and_tokenize_prompt)
)
else:
train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
val_data = None
if not ddp and torch.cuda.device_count() > 1:
# keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
model.is_parallelizable = True
model.model_parallel = True
trainer = transformers.Trainer(
model=model,
train_dataset=train_data,
eval_dataset=val_data,
args=transformers.TrainingArguments(
per_device_train_batch_size=micro_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
warmup_steps=100,
num_train_epochs=num_epochs,
learning_rate=learning_rate,
fp16=True,
logging_steps=10,
optim="adamw_torch",
evaluation_strategy="steps" if val_set_size > 0 else "no",
save_strategy="steps",
eval_steps=None,
save_steps=8000,
output_dir=output_dir,
save_total_limit=2,
load_best_model_at_end=True if val_set_size > 0 else False,
ddp_find_unused_parameters=False if ddp else None,
group_by_length=group_by_length,
report_to=None,
run_name=None,
),
data_collator=transformers.DataCollatorForSeq2Seq(
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
),
)
model.config.use_cache = False
old_state_dict = model.state_dict
model.state_dict = (
lambda self, *_, **__: get_peft_model_state_dict(
self, old_state_dict()
)
).__get__(model, type(model))
if torch.__version__ >= "2" and sys.platform != "win32":
model = torch.compile(model)
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
model.save_pretrained(output_dir)
print(
"\n If there's a warning about missing keys above, please disregard :)"
)
if __name__ == "__main__":
fire.Fire(train)

279
finetune_kopa.py Normal file

@@ -0,0 +1,279 @@
import os
import sys
from typing import List
import fire
import torch
import transformers
from datasets import load_dataset
from kopa import KoPA, KoPAWithAdapter
"""
Unused imports:
import torch.nn as nn
import bitsandbytes as bnb
"""
from peft import (
LoraConfig,
get_peft_model,
get_peft_model_state_dict,
prepare_model_for_int8_training,
set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer
from utils.prompter import Prompter
def train(
# model/data params
base_model: str = "", # the only required argument
    data_path: str = "YOUR DATA PATH",
output_dir: str = "./lora-alpaca",
# training hyperparams
batch_size: int = 16,
micro_batch_size: int = 16,
num_epochs: int = 2,
learning_rate: float = 3e-4,
cutoff_len: int = 512,
val_set_size: int = 0,
# lora hyperparams
lora_r: int = 16,
lora_alpha: int = 16,
lora_dropout: float = 0.05,
lora_target_modules: List[str] = [
"q_proj",
"v_proj",
],
num_prefix: int = 1,
# llm hyperparams
train_on_inputs: bool = True, # if False, masks out inputs in loss
add_eos_token: bool = False,
group_by_length: bool = False, # faster, but produces an odd training loss curve
# wandb params
wandb_project: str = "",
wandb_run_name: str = "",
wandb_watch: str = "", # options: false | gradients | all
wandb_log_model: str = "", # options: false | true
resume_from_checkpoint: str = None, # either training checkpoint or final adapter
prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca.
kge_model: str = "data/CoDeX-S.pth"
):
if int(os.environ.get("LOCAL_RANK", 0)) == 0:
print(
f"Training Alpaca-LoRA model with params:\n"
f"base_model: {base_model}\n"
f"data_path: {data_path}\n"
f"output_dir: {output_dir}\n"
f"batch_size: {batch_size}\n"
f"micro_batch_size: {micro_batch_size}\n"
f"num_epochs: {num_epochs}\n"
f"learning_rate: {learning_rate}\n"
f"cutoff_len: {cutoff_len}\n"
f"val_set_size: {val_set_size}\n"
f"lora_r: {lora_r}\n"
f"num_prefix: {num_prefix}\n"
f"lora_alpha: {lora_alpha}\n"
f"lora_dropout: {lora_dropout}\n"
f"lora_target_modules: {lora_target_modules}\n"
f"train_on_inputs: {train_on_inputs}\n"
f"add_eos_token: {add_eos_token}\n"
f"group_by_length: {group_by_length}\n"
f"wandb_project: {wandb_project}\n"
f"wandb_run_name: {wandb_run_name}\n"
f"wandb_watch: {wandb_watch}\n"
f"wandb_log_model: {wandb_log_model}\n"
f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
f"prompt template: {prompt_template_name}\n"
f"kge model: {kge_model}\n"
)
assert (
base_model
), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
gradient_accumulation_steps = batch_size // micro_batch_size
prompter = Prompter(prompt_template_name)
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
gradient_accumulation_steps = gradient_accumulation_steps // world_size
model = LlamaForCausalLM.from_pretrained(
base_model,
# load_in_8bit=True,
torch_dtype=torch.float16,
device_map=device_map,
)
tokenizer = LlamaTokenizer.from_pretrained(base_model)
tokenizer.pad_token_id = (
0 # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left" # Allow batched inference
def tokenize(prompt, add_eos_token=True):
# there's probably a way to do this with the tokenizer settings
# but again, gotta move fast
result = tokenizer(
prompt,
truncation=True,
max_length=cutoff_len,
padding=False,
return_tensors=None,
)
if (
result["input_ids"][-1] != tokenizer.eos_token_id
and len(result["input_ids"]) < cutoff_len
and add_eos_token
):
result["input_ids"].append(tokenizer.eos_token_id)
result["attention_mask"].append(1)
result["labels"] = result["input_ids"].copy()
return result
def generate_and_tokenize_prompt(data_point):
full_prompt = prompter.generate_prompt(
data_point["instruction"],
data_point["input"],
data_point["output"],
)
tokenized_full_prompt = tokenize(full_prompt)
if not train_on_inputs:
user_prompt = prompter.generate_prompt(
data_point["instruction"], data_point["input"]
)
tokenized_user_prompt = tokenize(
user_prompt, add_eos_token=add_eos_token
)
user_prompt_len = len(tokenized_user_prompt["input_ids"])
if add_eos_token:
user_prompt_len -= 1
tokenized_full_prompt["labels"] = [
-100
] * user_prompt_len + tokenized_full_prompt["labels"][
user_prompt_len:
] # could be sped up, probably
return tokenized_full_prompt
# model = prepare_model_for_int8_training(model)
config = LoraConfig(
r=lora_r,
lora_alpha=lora_alpha,
target_modules=lora_target_modules,
lora_dropout=lora_dropout,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
slama_model = KoPAWithAdapter(model, num_prefix, kge_model=kge_model)
    # Use the default HuggingFace datasets cache instead of a hard-coded personal path.
    if data_path.endswith(".json") or data_path.endswith(".jsonl"):
        data = load_dataset("json", data_files=data_path)
    else:
        data = load_dataset(data_path)
if resume_from_checkpoint:
# Check the available weights and load them
checkpoint_name = os.path.join(
resume_from_checkpoint, "pytorch_model.bin"
) # Full checkpoint
if not os.path.exists(checkpoint_name):
checkpoint_name = os.path.join(
resume_from_checkpoint, "adapter_model.bin"
) # only LoRA model - LoRA config above has to fit
resume_from_checkpoint = (
False # So the trainer won't try loading its state
)
# The two files above have a different name depending on how they were saved, but are actually the same.
if os.path.exists(checkpoint_name):
print(f"Restarting from {checkpoint_name}")
adapters_weights = torch.load(checkpoint_name)
set_peft_model_state_dict(model, adapters_weights)
else:
print(f"Checkpoint {checkpoint_name} not found")
# model.print_trainable_parameters() # Be more transparent about the % of trainable params.
if val_set_size > 0:
train_val = data["train"].train_test_split(
test_size=val_set_size, shuffle=True, seed=42
)
train_data = (
train_val["train"].shuffle().map(generate_and_tokenize_prompt)
)
val_data = (
train_val["test"].shuffle().map(generate_and_tokenize_prompt)
)
else:
train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
val_data = None
if not ddp and torch.cuda.device_count() > 1:
# keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
model.is_parallelizable = True
model.model_parallel = True
trainer = transformers.Trainer(
model=slama_model,
train_dataset=train_data,
eval_dataset=val_data,
args=transformers.TrainingArguments(
per_device_train_batch_size=micro_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
warmup_steps=100,
num_train_epochs=num_epochs,
learning_rate=learning_rate,
fp16=True,
logging_steps=10,
optim="adamw_hf",
evaluation_strategy="steps" if val_set_size > 0 else "no",
save_strategy="steps",
eval_steps=None,
save_steps=5000,
output_dir=output_dir,
save_total_limit=2,
load_best_model_at_end=True if val_set_size > 0 else False,
ddp_find_unused_parameters=False if ddp else None,
group_by_length=group_by_length,
report_to=None,
run_name=None,
),
data_collator=transformers.DataCollatorForSeq2Seq(
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
),
)
model.config.use_cache = False
old_state_dict = model.state_dict
model.state_dict = (
lambda self, *_, **__: get_peft_model_state_dict(
self, old_state_dict()
)
).__get__(model, type(model))
if torch.__version__ >= "2" and sys.platform != "win32":
model = torch.compile(model)
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
model.save_pretrained(output_dir)
torch.save(slama_model.embeddings, os.path.join(output_dir, "embeddings.pth"))
print(
"\n If there's a warning about missing keys above, please disregard :)"
)
if __name__ == "__main__":
fire.Fire(train)

95
inference_kopa.py Normal file

@@ -0,0 +1,95 @@
import os
import json
import torch
import transformers
from peft import PeftModel
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
base_path = 'YOUR LLM PATH'
prompt_template = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Given a triple from a knowledge graph. Each triple consists of a head entity, a relation, and a tail entity. Please determine the correctness of the triple and response True or False.
### Input:
{}
### Response:
"""
def load_test_dataset(path):
test_dataset = json.load(open(path, "r"))
return test_dataset
if __name__ == "__main__":
cuda = "cuda:0"
lora_weights = "YOUR SAVE PATH"
test_data_path = "data/UMLS-test.json"
embedding_path = "{}/embeddings.pth".format(lora_weights)
test_dataset = load_test_dataset(test_data_path)
kg_embeddings = torch.load(embedding_path).to(cuda)
tokenizer = LlamaTokenizer.from_pretrained(base_path)
model = LlamaForCausalLM.from_pretrained(
base_path,
torch_dtype=torch.float16
).to(cuda)
model = PeftModel.from_pretrained(
model,
lora_weights,
torch_dtype=torch.float16,
).to(cuda)
# unwind broken decapoda-research config
model.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2
model = model.eval()
result = []
for data in test_dataset:
ent = data["input"]
ans = data["output"]
ids = data["embedding_ids"]
ids = torch.LongTensor(ids).reshape(1, -1).to(cuda)
prefix = kg_embeddings(ids)
prompt = prompt_template.format(ent)
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs.input_ids.to(cuda)
token_embeds = model.model.model.embed_tokens(input_ids)
input_embeds = torch.cat((prefix, token_embeds), dim=1)
generate_ids = model.generate(
inputs_embeds=input_embeds,
max_new_tokens=16
)
context = tokenizer.batch_decode(input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
response = response.replace(context, "").strip()
print(response + '\n')
result.append(
{
"answer": ans,
"predict": response
}
)
answer = []
predict = []
for data in result:
if "True" in data["answer"]:
answer.append(1)
else:
answer.append(0)
if "True" in data["predict"]:
predict.append(1)
else:
predict.append(0)
acc = accuracy_score(y_true=answer, y_pred=predict)
p = precision_score(y_true=answer, y_pred=predict)
r = recall_score(y_true=answer, y_pred=predict)
f1 = f1_score(y_true=answer, y_pred=predict)
print(acc, p, r, f1)

181
kopa.py Normal file

@@ -0,0 +1,181 @@
import torch
import torch.nn as nn
from typing import Optional, List, Union, Tuple
from transformers import LlamaForCausalLM
from process_kge import load_pretrain_kge
class KoPA(nn.Module):
def __init__(
self,
model: LlamaForCausalLM
) -> None:
super(KoPA, self).__init__()
self.llama_model = model
# self.embeddings = nn.Embedding(100, 4096)
self.embeddings = PrefixKGEmbedding(
num_ent=2034,
num_rel=42,
dim_llm=4096,
num_prefix=1
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
embedding_ids: torch.LongTensor = None
):
kg_embeds = self.embeddings(embedding_ids)
batch_size, seq_len, _ = kg_embeds.shape
token_embeds = self.llama_model.model.model.embed_tokens(input_ids)
input_embeds = torch.cat((kg_embeds, token_embeds), dim=1)
prefix_mask = torch.ones((batch_size, seq_len))
prefix_labels = torch.full((batch_size, seq_len), fill_value=-100, dtype=torch.long)
new_attention_mask = torch.cat((prefix_mask.cuda(), attention_mask), dim=-1)
new_labels = torch.cat((prefix_labels.cuda(), labels), dim=-1)
return self.llama_model(
input_ids=None,
attention_mask=new_attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=input_embeds,
labels=new_labels,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class KoPAWithAdapter(nn.Module):
def __init__(
self,
model: LlamaForCausalLM,
num_prefix: int,
kge_model: str = "data/UMLS-rotate.pth",
pretrain_emb_path = None
) -> None:
super(KoPAWithAdapter, self).__init__()
self.llama_model = model
ent_embs, rel_embs = load_pretrain_kge(kge_model)
if pretrain_emb_path is None:
print("Adapter Trained From Scratch".format(pretrain_emb_path))
self.embeddings = PretrainKGEmbedding(
pretrain_ent_embs=ent_embs,
pretrain_rel_embs=rel_embs,
dim_llm=4096,
num_prefix=num_prefix
)
else:
print("Adapter Load From {}".format(pretrain_emb_path))
self.embeddings = torch.load(pretrain_emb_path)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
embedding_ids: torch.LongTensor = None
):
kg_embeds = self.embeddings(embedding_ids)
# print(kg_embeds.shape)
batch_size, seq_len, _ = kg_embeds.shape
token_embeds = self.llama_model.model.model.embed_tokens(input_ids)
input_embeds = torch.cat((kg_embeds, token_embeds), dim=1)
prefix_mask = torch.ones((batch_size, seq_len))
prefix_labels = torch.full((batch_size, seq_len), fill_value=-100, dtype=torch.long)
new_attention_mask = torch.cat((prefix_mask.cuda(), attention_mask), dim=-1)
new_labels = torch.cat((prefix_labels.cuda(), labels), dim=-1)
return self.llama_model(
input_ids=None,
attention_mask=new_attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=input_embeds,
labels=new_labels,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class PrefixKGEmbedding(nn.Module):
def __init__(
self,
num_ent,
num_rel,
dim_llm,
num_prefix
):
super(PrefixKGEmbedding, self).__init__()
self.emb_dim = num_prefix * dim_llm
self.ent_embeddings = nn.Embedding(num_ent, self.emb_dim)
self.rel_embeddings = nn.Embedding(num_rel, self.emb_dim)
def forward(self, triple_ids):
head, relation, tail = triple_ids[:, 0], triple_ids[:, 1], triple_ids[:, 2]
h = self.ent_embeddings(head)
r = self.rel_embeddings(relation)
t = self.ent_embeddings(tail)
prefix = torch.stack((h, r, t), dim=1)
return prefix
class PretrainKGEmbedding(nn.Module):
def __init__(
self,
pretrain_ent_embs,
pretrain_rel_embs,
dim_llm,
num_prefix
):
super(PretrainKGEmbedding, self).__init__()
self.num_prefix = num_prefix
self.llm_dim = dim_llm
self.emb_dim = num_prefix * dim_llm
self.ent_embeddings = nn.Embedding.from_pretrained(pretrain_ent_embs)
self.rel_embeddings = nn.Embedding.from_pretrained(pretrain_rel_embs)
self.pretrain_dim = self.ent_embeddings.weight.shape[1]
# Froze the pretrain embeddings
self.ent_embeddings.requires_grad_(False)
self.rel_embeddings.requires_grad_(False)
self.adapter = nn.Linear(self.pretrain_dim, self.emb_dim)
def forward(self, triple_ids):
# main training stage
if triple_ids.shape[1] == 3:
head, relation, tail = triple_ids[:, 0], triple_ids[:, 1], triple_ids[:, 2]
h = self.ent_embeddings(head)
r = self.rel_embeddings(relation)
t = self.ent_embeddings(tail)
pretrain_embs = torch.stack((h, r, t), dim=1)
prefix = self.adapter(pretrain_embs).reshape(-1, 3*self.num_prefix, self.llm_dim)
return prefix
        # entity-aware pre-training
else:
ent = triple_ids.reshape(-1,)
emb = self.ent_embeddings(ent)
prefix = self.adapter(emb).reshape(-1, self.num_prefix, self.llm_dim)
# print(prefix.shape)
return prefix

39
process_kge.py Normal file

@@ -0,0 +1,39 @@
import torch
def load_pretrain_kge(path):
if "complex" in path:
return load_complex_model(path)
kge_model = torch.load(path)
ent_embs = torch.tensor(kge_model["ent_embeddings.weight"]).cpu()
rel_embs = torch.tensor(kge_model["rel_embeddings.weight"]).cpu()
ent_embs.requires_grad = False
rel_embs.requires_grad = False
ent_dim = ent_embs.shape[1]
rel_dim = rel_embs.shape[1]
print(ent_dim, rel_dim)
if ent_dim != rel_dim:
rel_embs = torch.cat((rel_embs, rel_embs), dim=-1)
# print(ent_embs.shape, rel_embs.shape)
# print(ent_embs.requires_grad, rel_embs.requires_grad)
return ent_embs, rel_embs
def load_complex_model(path):
kge_model = torch.load(path)
ent_embs1 = torch.tensor(kge_model["ent_re_embeddings.weight"]).cpu()
ent_embs2 = torch.tensor(kge_model["ent_im_embeddings.weight"]).cpu()
rel_embs1 = torch.tensor(kge_model["rel_re_embeddings.weight"]).cpu()
rel_embs2 = torch.tensor(kge_model["rel_im_embeddings.weight"]).cpu()
ent_embs = torch.cat((ent_embs1, ent_embs2), dim=-1)
rel_embs = torch.cat((rel_embs1, rel_embs2), dim=-1)
ent_embs.requires_grad = False
rel_embs.requires_grad = False
ent_dim = ent_embs.shape[1]
rel_dim = rel_embs.shape[1]
print(ent_dim, rel_dim)
return ent_embs, rel_embs
if __name__ == "__main__":
load_pretrain_kge("data/CoDeX-S-complex.pth")

12
requirements.txt Normal file

@@ -0,0 +1,12 @@
accelerate
appdirs
loralib
bitsandbytes
black
black[jupyter]
datasets
fire
git+https://github.com/huggingface/peft.git
transformers>=4.28.0
sentencepiece
gradio

13
scripts/run_umls_base.sh Normal file

@@ -0,0 +1,13 @@
export WANDB_DISABLED=true
wandb offline
CUDA_VISIBLE_DEVICES=1 nohup python finetune.py \
--base_model 'YOUR LLM PATH' \
--data_path 'preprocess/UMLS-train.json' \
--output_dir 'YOUR SAVE PATH' \
--num_epochs 3 \
--batch_size 12 \
--micro_batch_size 12 \
--lora_r 32 \
--learning_rate 3e-4 \
--lora_target_modules='[q_proj,k_proj,v_proj,o_proj]' > log.txt &

14
scripts/run_umls_kopa.sh Normal file

@@ -0,0 +1,14 @@
export WANDB_DISABLED=true
wandb offline
CUDA_VISIBLE_DEVICES=0 nohup python finetune_kopa.py \
--base_model 'YOUR LLM PATH' \
--data_path 'data/UMLS-train.json' \
--output_dir 'YOUR SAVE PATH' \
--num_epochs 3 \
--lora_r 32 \
--learning_rate 3e-4 \
--batch_size 12 \
--micro_batch_size 12 \
--num_prefix 1 \
--kge_model 'data/UMLS-rotate.pth' \
--lora_target_modules='[q_proj,k_proj,v_proj,o_proj]' > log.txt &

46
templates/README.md Normal file

@@ -0,0 +1,46 @@
# Prompt templates
This directory contains template styles for the prompts used to finetune LoRA models.
## Format
A template is described via a JSON file with the following keys:
- `prompt_input`: The template to use when input is not None. Uses `{instruction}` and `{input}` placeholders.
- `prompt_no_input`: The template to use when input is None. Uses the `{instruction}` placeholder.
- `description`: A short description of the template, with possible use cases.
- `response_split`: The text to use as separator when cutting real response from the model output.
No `{response}` placeholder is used, since the response is always the last element of the template and is simply concatenated to the rest (see the usage sketch below).
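For reference, these keys are consumed by `utils/prompter.py` (also added in this commit); a minimal usage sketch with illustrative strings:
```python
from utils.prompter import Prompter

prompter = Prompter("alpaca")  # loads templates/alpaca.json
prompt = prompter.generate_prompt(
    instruction="Determine the correctness of the triple and respond True or False.",
    input="(aspirin, treats, headache)",  # illustrative input, not from the repo's datasets
)
# response_split ("### Response:") is what get_response() uses to cut the answer
# out of the full decoded model output:
# answer = prompter.get_response(full_model_output)
```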
## Example template
The default template, used unless otherwise specified, is `alpaca.json`
```json
{
"description": "Template used by Alpaca-LoRA.",
"prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
"prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
"response_split": "### Response:"
}
```
## Current templates
### alpaca
Default template used for generic LoRA fine tunes so far.
### alpaca_legacy
Legacy template used by the original alpaca repo, with no `\n` after the response field. Kept for reference and experiments.
### alpaca_short
A trimmed-down alpaca template which seems to perform just as well and spare some tokens. Models created with the default template seem to be queryable by the short template as well. More experiments are welcome.
### vigogne
The default alpaca template, translated to French. This template was used to train the "Vigogne" LoRA and is to be used to query it, or for extra fine-tuning.

6
templates/alpaca.json Normal file

@@ -0,0 +1,6 @@
{
"description": "Template used by Alpaca-LoRA.",
"prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
"prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
"response_split": "### Response:"
}

6
templates/alpaca_legacy.json Normal file

@@ -0,0 +1,6 @@
{
"description": "Legacy template, used by Original Alpaca repository.",
"prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:",
"prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:",
"response_split": "### Response:"
}

6
templates/alpaca_short.json Normal file

@@ -0,0 +1,6 @@
{
"description": "A shorter template to experiment with.",
"prompt_input": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
"prompt_no_input": "### Instruction:\n{instruction}\n\n### Response:\n",
"response_split": "### Response:"
}

6
templates/vigogne.json Normal file

@@ -0,0 +1,6 @@
{
"description": "French template, used by Vigogne for finetuning.",
"prompt_input": "Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Entrée:\n{input}\n\n### Réponse:\n",
"prompt_no_input": "Ci-dessous se trouve une instruction qui décrit une tâche. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Réponse:\n",
"response_split": "### Réponse:"
}

0
utils/__init__.py Normal file

75
utils/callbacks.py Normal file

@@ -0,0 +1,75 @@
"""
Helpers to support streaming generate output.
Borrowed from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/callbacks.py
"""
import gc
import traceback
from queue import Queue
from threading import Thread
import torch
import transformers
class Stream(transformers.StoppingCriteria):
def __init__(self, callback_func=None):
self.callback_func = callback_func
def __call__(self, input_ids, scores) -> bool:
if self.callback_func is not None:
self.callback_func(input_ids[0])
return False
class Iteratorize:
"""
Transforms a function that takes a callback
into a lazy iterator (generator).
"""
def __init__(self, func, kwargs={}, callback=None):
self.mfunc = func
self.c_callback = callback
self.q = Queue()
self.sentinel = object()
self.kwargs = kwargs
self.stop_now = False
def _callback(val):
if self.stop_now:
raise ValueError
self.q.put(val)
def gentask():
try:
ret = self.mfunc(callback=_callback, **self.kwargs)
except ValueError:
pass
except:
traceback.print_exc()
pass
self.q.put(self.sentinel)
if self.c_callback:
self.c_callback(ret)
self.thread = Thread(target=gentask)
self.thread.start()
def __iter__(self):
return self
def __next__(self):
obj = self.q.get(True, None)
if obj is self.sentinel:
raise StopIteration
else:
return obj
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.stop_now = True

51
utils/prompter.py Normal file

@@ -0,0 +1,51 @@
"""
A dedicated helper to manage templates and prompt building.
"""
import json
import os.path as osp
from typing import Union
class Prompter(object):
__slots__ = ("template", "_verbose")
def __init__(self, template_name: str = "", verbose: bool = False):
self._verbose = verbose
if not template_name:
# Enforce the default here, so the constructor can be called with '' and will not break.
template_name = "alpaca"
file_name = osp.join("templates", f"{template_name}.json")
if not osp.exists(file_name):
raise ValueError(f"Can't read {file_name}")
with open(file_name) as fp:
self.template = json.load(fp)
if self._verbose:
print(
f"Using prompt template {template_name}: {self.template['description']}"
)
def generate_prompt(
self,
instruction: str,
input: Union[None, str] = None,
label: Union[None, str] = None,
) -> str:
# returns the full prompt from instruction and optional input
# if a label (=response, =output) is provided, it's also appended.
if input:
res = self.template["prompt_input"].format(
instruction=instruction, input=input
)
else:
res = self.template["prompt_no_input"].format(
instruction=instruction
)
if label:
res = f"{res}{label}"
if self._verbose:
print(res)
return res
def get_response(self, output: str) -> str:
return output.split(self.template["response_split"])[1].strip()