add InternLM2_7B eval
parent 84581eb9e0
commit d825895a79
@ -43,13 +43,9 @@ pip install torch transformers datasets nltk rouge jieba

 ## Test Results

-Testing on the data in data.json with the fully fine-tuned Qwen1_5-0_5B-Chat model gives the following results:
-
-| Metric  | Value  |
-|---------|--------|
-| ROUGE-1 | 27.23% |
-| ROUGE-2 | 8.55%  |
-| ROUGE-L | 17.05% |
-| BLEU-1  | 26.65% |
-| BLEU-2  | 13.11% |
-| BLEU-3  | 7.19%  |
-| BLEU-4  | 4.05%  |
+Testing on the data in data.json gives the following results:
+
+| Model             | ROUGE-1 | ROUGE-2 | ROUGE-L | BLEU-1 | BLEU-2 | BLEU-3 | BLEU-4 |
+|-------------------|---------|---------|---------|--------|--------|--------|--------|
+| Qwen1_5-0_5B-Chat | 27.23%  | 8.55%   | 17.05%  | 26.65% | 13.11% | 7.19%  | 4.05%  |
+| InternLM2_7B_chat | 37.86%  | 15.23%  | 24.34%  | 39.71% | 22.66% | 14.26% | 9.21%  |
+
111
evaluate/InternLM2_7B_chat_eval.py
Normal file
@ -0,0 +1,111 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding
from qwen_generation_utils import decode_tokens
import torch
import datasets


model_dir = './model'
tokenizer = AutoTokenizer.from_pretrained(model_dir, device_map="auto", padding_side='left', trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and might cause OOM Error.
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", pad_token_id=tokenizer.eos_token_id, trust_remote_code=True, torch_dtype=torch.float16)
# (Optional) If on low resource devices, you can load the model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
# InternLM 7B in 4-bit will cost nearly 8GB GPU memory.
# pip install -U bitsandbytes
# 8-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_8bit=True)
# 4-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_4bit=True)
model = model.eval()

# # convert data
# import ujson
# def transform_conversation_data(raw_data):
#     try:
#         instruction = '<|im_start|>system\n' + raw_data.get("conversation", "")[0]['system'] + "<|im_end|>\n"
#
#         conversation = raw_data.get("conversation", [])
#         for i, dialog in enumerate(conversation):
#             instruction += "<|im_start|>user\n来访者:" + dialog["input"] + "<|im_end|>\n"
#
#             if i < len(conversation) - 1:
#                 instruction += "<|im_start|>assistant\n医生:" + dialog["output"] + "<|im_end|>\n"
#
#         response = conversation[-1]["output"] if conversation else ""
#
#         instruction += "<|im_start|>assistant\n医生:"
#
#         return {"instruction": instruction, "output": response}
#
#     except Exception as e:
#         pass
#
#
# with open(f'./data_dir/data.json', 'r', encoding='utf-8') as f1:
#     data = ujson.load(f1)
# with open(f'./data_dir/converted.json', 'w', encoding='utf-8') as f:
#     for j, item in enumerate(data):
#         temp = transform_conversation_data(item)
#         if temp:
#             transformed_data = ujson.dumps(temp, ensure_ascii=False)
#             f.write(transformed_data + '\n')
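# Illustration (invented sample): for a two-turn conversation
#   {"conversation": [{"system": S, "input": A, "output": B},
#                     {"input": C, "output": D}]}
# the converter above would emit
#   instruction = "<|im_start|>system\nS<|im_end|>\n"
#                 "<|im_start|>user\n来访者:A<|im_end|>\n"
#                 "<|im_start|>assistant\n医生:B<|im_end|>\n"
#                 "<|im_start|>user\n来访者:C<|im_end|>\n"
#                 "<|im_start|>assistant\n医生:"
#   output = D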
# set test params
test_num = 1596  # number of test samples
batch_size = 12

# prepare data and dataloader
dataset = datasets.load_dataset('json', data_files='./data_dir/converted.json', split=f"train[:{test_num}]")
references = dataset['output'][:test_num]

hypotheses = []

def preprocess(data):
    length = list(map(len, data['instruction']))
    model_inputs = tokenizer(data['instruction'], max_length=512, truncation=True)
    # labels are tokenized for completeness but are not consumed by generation below
    labels = tokenizer(data['output'], padding=True, max_length=128, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    model_inputs['length'] = length
    return model_inputs

preprocessed_dataset = dataset.map(preprocess, batched=True, remove_columns=['instruction', 'output'])

collator = DataCollatorWithPadding(tokenizer=tokenizer)
from torch.utils.data import DataLoader

dataloader = DataLoader(preprocessed_dataset, batch_size=batch_size, collate_fn=collator)

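# Note: under these settings each collated batch is roughly
#   {'input_ids':      LongTensor [batch_size, L],  # left-padded prompts, L = longest prompt in the batch
#    'attention_mask': LongTensor [batch_size, L],  # 0 over padding, 1 over real tokens
#    'labels':         reference output token ids (read but unused below),
#    'length':         original character lengths of the prompts}
# (a sketch for orientation, not output from a run)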
# generate responses
stop_word = "<|im_end|>"
for batch in dataloader:
    batch_input_ids = torch.LongTensor(batch['input_ids']).to(model.device)
    batch_labels = batch['labels']
    attention_mask = batch['attention_mask']
    length = batch['length']
    batch_out_ids = model.generate(
        batch_input_ids.to(model.device),
        return_dict_in_generate=False,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.1,
        eos_token_id=92542  # id of <|im_end|> in the InternLM2 tokenizer
    )

    # prompts are left-padded, so skip each sample's padding before decoding
    padding_lens = [batch_input_ids[i].eq(tokenizer.pad_token_id).sum().item() for i in range(batch_input_ids.size(0))]
    batch_response = [
        decode_tokens(
            batch_out_ids[i][padding_lens[i]:],
            tokenizer,
            context_length=0,
            raw_text_len=length[i],
            chat_format="raw",
            verbose=False,
            errors='replace'
        ).replace("医生:", "")
        for i in range(batch_input_ids.size(0))]  # iterate over the actual batch size: the final batch may be smaller than batch_size
    # keep only the text generated before the stop word
    hypotheses.extend([r.replace(stop_word, " ").split()[0] for r in batch_response if stop_word in r])

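# Illustration (invented sample) of the trimming above: after decode_tokens and
# .replace("医生:", ""), a response looks like
#   r = "您可以先试着深呼吸。<|im_end|>"
# r.replace(stop_word, " ").split()[0] then keeps only the text before <|im_end|>.
# This assumes the response contains no other whitespace, which holds for
# pure-Chinese outputs but would truncate a response containing spaces.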
# Load metric
from metric import compute_metrics

print(compute_metrics((hypotheses, references)))
@ -25,7 +25,7 @@ batch_size=12


 #prepare data and dataloader
-dataset = datasets.load_dataset('json', data_files='./train_dir/converted.json',split=f"train[:{test_num}]")
+dataset = datasets.load_dataset('json', data_files='./data_dir/converted.json',split=f"train[:{test_num}]")
 references =dataset['output'][:test_num]

 hypotheses = []
@ -4,15 +4,10 @@

 * See General evaluation.md for the specific metrics and methodology

-| Metric  | Value  |
-|---------|--------|
-| ROUGE-1 | 27.23% |
-| ROUGE-2 | 8.55%  |
-| ROUGE-L | 17.05% |
-| BLEU-1  | 26.65% |
-| BLEU-2  | 13.11% |
-| BLEU-3  | 7.19%  |
-| BLEU-4  | 4.05%  |
+| Model             | ROUGE-1 | ROUGE-2 | ROUGE-L | BLEU-1 | BLEU-2 | BLEU-3 | BLEU-4 |
+|-------------------|---------|---------|---------|--------|--------|--------|--------|
+| Qwen1_5-0_5B-Chat | 27.23%  | 8.55%   | 17.05%  | 26.65% | 13.11% | 7.19%  | 4.05%  |
+| InternLM2_7B_chat | 37.86%  | 15.23%  | 24.34%  | 39.71% | 22.66% | 14.26% | 9.21%  |

 ## Professional Metric Evaluation
@ -18,8 +18,8 @@ def compute_metrics(eval_pred):

     rouge = Rouge()

-    bleu = np.array([0,0,0,0])
-    weights = [(1.,0,0,0),(1./2., 1./2.),(1./3., 1./3., 1./3.),(1./4., 1./4., 1./4., 1./4.)]
+    bleu = np.array([0.,0.,0.,0.])
+    weights = [(1.,0.,0.,0.),(1./2., 1./2.),(1./3., 1./3., 1./3.),(1./4., 1./4., 1./4., 1./4.)]
     for decoded_label, decoded_pred in zip(decoded_labels, decoded_preds):
         bleu += np.array(sentence_bleu(
             references=[decoded_label.split(' ')],
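The hunk above shows only the changed accumulator lines. For context, here is a minimal self-contained sketch of a `compute_metrics` consistent with this fragment; the ROUGE call, the averaging, and the return format are assumptions, not the repository's actual code:

```python
import numpy as np
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu

def compute_metrics(eval_pred):
    # eval_pred = (hypotheses, references): parallel lists of space-tokenized strings
    decoded_preds, decoded_labels = eval_pred

    rouge = Rouge()
    rouge_scores = rouge.get_scores(decoded_preds, decoded_labels, avg=True)

    # The accumulator must be a float array: with np.array([0, 0, 0, 0]) (int64),
    # `bleu += <float scores>` raises a numpy casting error, which is presumably
    # why this commit switches to float literals.
    bleu = np.array([0., 0., 0., 0.])
    # One weight tuple per BLEU order; recent NLTK returns one score per tuple.
    weights = [(1., 0., 0., 0.), (1./2., 1./2.), (1./3., 1./3., 1./3.),
               (1./4., 1./4., 1./4., 1./4.)]
    for decoded_label, decoded_pred in zip(decoded_labels, decoded_preds):
        bleu += np.array(sentence_bleu(
            references=[decoded_label.split(' ')],
            hypothesis=decoded_pred.split(' '),
            weights=weights,
        ))
    bleu /= len(decoded_labels)  # average BLEU-1..4 over the test set

    # Scores are fractions in [0, 1]; the README tables report them as percentages.
    return {
        'rouge-1': rouge_scores['rouge-1']['f'],
        'rouge-2': rouge_scores['rouge-2']['f'],
        'rouge-l': rouge_scores['rouge-l']['f'],
        'bleu-1': bleu[0], 'bleu-2': bleu[1], 'bleu-3': bleu[2], 'bleu-4': bleu[3],
    }
```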