RAG - Initial commit

This commit is contained in:
Mxode 2024-03-07 18:05:10 +08:00
parent 18997ec79c
commit 1ca8349839
6 changed files with 147 additions and 0 deletions

0
rag/README.md Normal file
View File

4
rag/requirements.txt Normal file
View File

@ -0,0 +1,4 @@
sentence_transformers
transformers
numpy
loguru

19
rag/src/config/config.py Normal file
View File

@ -0,0 +1,19 @@
import os

# Directory layout, resolved relative to this file:
#   <base>/src/config/config.py
cur_dir = os.path.dirname(os.path.abspath(__file__))   # .../src/config
src_dir = os.path.dirname(cur_dir)                     # .../src
base_dir = os.path.dirname(src_dir)                    # project root

# Model artifacts live under <base>/model.
model_dir = os.path.join(base_dir, 'model')
embedding_path = os.path.join(model_dir, 'gte-small-zh')   # sentence-embedding model
llm_path = os.path.join(model_dir, 'pythia-14m')           # language model

# Knowledge-base data lives under <base>/data.
data_dir = os.path.join(base_dir, 'data')
knowledge_json_path = os.path.join(data_dir, 'knowledge.json')   # raw QA pairs (JSON)
knowledge_pkl_path = os.path.join(data_dir, 'knowledge.pkl')     # pickled embeddings

# Log output goes to <base>/log/log.log.
log_dir = os.path.join(base_dir, 'log')
log_path = os.path.join(log_dir, 'log.log')

67
rag/src/main.py Normal file
View File

@ -0,0 +1,67 @@
import os
import json
import pickle
import numpy as np
from typing import Tuple
from sentence_transformers import SentenceTransformer
from config.config import knowledge_json_path, knowledge_pkl_path
from util.encode import load_embedding, encode_qa
"""
读取知识库
"""
def load_knowledge() -> Tuple[list, list]:
# 如果 pkl 不存在,则先编码存储
if not os.path.exists(knowledge_pkl_path):
encode_qa(knowledge_json_path, knowledge_pkl_path)
# 加载 json 和 pkl
with open(knowledge_json_path, 'r', encoding='utf-8') as f1, open(knowledge_pkl_path, 'rb') as f2:
knowledge = json.load(f1)
encoded_knowledge = pickle.load(f2)
return knowledge, encoded_knowledge
"""
召回 top_k 个相关的文本段
"""
def find_top_k(
emb: SentenceTransformer,
query: str,
knowledge: list,
encoded_knowledge: list,
k=3
) -> list[str]:
# 编码 query
query_embedding = emb.encode(query)
# 查找 top_k
scores = query_embedding @ encoded_knowledge.T
# 使用 argpartition 找出每行第 k 个大的值的索引,第 k 个位置左侧都是比它大的值,右侧都是比它小的值
top_k_indices = np.argpartition(scores, -k)[-k:]
# 由于 argpartition 不保证顺序,我们需要对提取出的 k 个索引进行排序
top_k_values_sorted_indices = np.argsort(scores[top_k_indices])[::-1]
top_k_indices = top_k_indices[top_k_values_sorted_indices]
# 返回
contents = [knowledge[index] for index in top_k_indices]
return contents
def main():
    """Entry point: retrieve passages for a demo query and print them."""
    emb = load_embedding()
    knowledge, encoded_knowledge = load_knowledge()

    query = "认知心理学研究哪些心理活动?"
    top_contents = find_top_k(emb, query, knowledge, encoded_knowledge, 2)

    print('召回的 top-k 条相关内容如下:')
    print(json.dumps(top_contents, ensure_ascii=False, indent=2))

    # The LLM stage is not implemented here. With an LLM available:
    #   1. load the LLM
    #   2. join the retrieved contents into the prompt as the "known context"
    #   3. ask the LLM to answer based on that context


if __name__ == '__main__':
    main()

57
rag/src/util/encode.py Normal file
View File

@ -0,0 +1,57 @@
import json
import pickle
from loguru import logger
from sentence_transformers import SentenceTransformer
from config.config import embedding_path
"""
加载向量模型
"""
def load_embedding() -> SentenceTransformer:
logger.info('Loading embedding...')
emb = SentenceTransformer(embedding_path)
logger.info('Embedding loaded.')
return emb
"""
文本编码
"""
def encode_raw_corpus(file_path: str, store_path: str) -> None:
emb = load_embedding()
with open(file_path, 'r', encoding='utf-8') as f:
read_lines = f.readlines()
"""
对文本分割例如按句子分割
"""
lines = []
# 分割好后的存入 lines 中
# 编码(转换为向量)
encoded_knowledge = emb.encode(lines)
with open(store_path, 'wb') as f:
pickle.dump(encoded_knowledge, f)
"""
QA 对编码
暂时只实现了加载 jsoncsv和txt先没写
"""
def encode_qa(file_path: str, store_path: str) -> None:
emb = load_embedding()
with open(file_path, 'r', encoding='utf-8') as f:
qa_list = json.load(f)
# 将 QA 对拼起来作为完整一句来编码,也可以只编码 Q
lines = []
for qa in qa_list:
question = qa['question']
answer = qa['answer']
lines.append(question + answer)
encoded_knowledge = emb.encode(lines)
with open(store_path, 'wb') as f:
pickle.dump(encoded_knowledge, f)

0
rag/src/util/llm.py Normal file
View File