Merge branch 'main' of https://github.com/chg0901/EmoLLM

2024-05-04 12:18:32 +09:00 · 2024-05-04 12:18:32 +09:00 · 85c36244ad
commit 85c36244ad
parent c5d3e8f834 0aa6079770
6 changed files with 267 additions and 18 deletions
--- a/rag/README.md
+++ b/rag/README.md
@ -19,6 +19,8 @@ langchain_openai==0.0.8
 langchain_text_splitters==0.0.1
 FlagEmbedding==1.2.8
 unstructured==0.12.6
 PyJWT
 faiss-gpu  # faiss-cpu for device without gpu
 ```
 ```python
@ -32,10 +34,18 @@ pip3 install -r requirements.txt
 ### 准备数据
- txt数据：放入到 src.data.txt 目录下
+#### 搭建自己的 Vector DB
- json 数据：放入到 src.data.json 目录下
+
 ##### TXT 数据
 将需要构建的知识库转化为 Txt 文件放入到 src.data.txt 目录下
 ##### JSON 数据
 构建 QA 对并生成 JSON 文件（多轮对话），放入到 src.data.json 目录下
 数据格式如下
 JSON 数据格式如下
 ```python
 [
    {
@ -53,18 +63,23 @@ JSON 数据格式如下
 ] 
 ```
-会根据准备的数据构建vector DB，最终会在 data 文件夹下产生名为 vector_db 的文件夹包含 index.faiss 和 index.pkl
+会根据准备的数据构建 vector DB，最终会在 data 文件夹下产生名为 vector_db 的文件夹包含 index.faiss 和 index.pkl。如果已经有 vector DB 则会直接加载对应数据库
-如果已经有 vector DB 则会直接加载对应数据库
+- 可以直接从 xlab 下载对应 DB（请在rag文件目录下执行对应 code）
 **注意**: 可以直接从 xlab 下载对应 DB（请在rag文件目录下执行对应 code）
 ```bash
 # https://openxlab.org.cn/models/detail/Anooyman/EmoLLMRAGTXT/tree/main
 git lfs install
 git clone https://code.openxlab.org.cn/Anooyman/EmoLLMRAGTXT.git
 ```
 - 也可以从魔搭社区（ModelScope）下载对应数据集
 ```bash
 # https://www.modelscope.cn/datasets/Anooyman/EmoLLMRAGTXT/summary
 git clone https://www.modelscope.cn/datasets/Anooyman/EmoLLMRAGTXT.git
 ```
 ### 配置 config 文件
@ -106,7 +121,50 @@ prompt_template = """
 """
 ```
-### 调用
+### 本地调用
 *注意*
 由于 RAG code 已经集成到 `web_internlm2.py` 中，import 路径不再适用于本地调用
 因此需要如下调整对应 import 路径
 - src/data_processing.py
 ```python
 #from rag.src.config.config import (
 #    embedding_path,
 #    embedding_model_name,
 #    doc_dir, qa_dir,
 #    knowledge_pkl_path,
 #    data_dir,
 #    vector_db_dir,
 #    rerank_path,
 #    rerank_model_name,
 #    chunk_size,
 #    chunk_overlap
 #)
 from config.config import (
    embedding_path,
    embedding_model_name,
    doc_dir, qa_dir,
    knowledge_pkl_path,
    data_dir,
    vector_db_dir,
    rerank_path,
    rerank_model_name,
    chunk_size,
    chunk_overlap
 )
 ```
 - src/pipeline.py
 ```python
 #from rag.src.data_processing import Data_process
 #from rag.src.config.config import prompt_template 
 from data_processing import Data_process
 from config.config import prompt_template 
 ```
 修改 import 路径之后通过以下 code 执行
 ```bash
 cd rag/src
@ -128,6 +186,13 @@ python main.py
 ## **相关组件**
 这里我们提供了BGE和BCEmbedding两种组合方式，更加推荐性能更加优异的BGE
 ### [BGE Github](https://github.com/FlagOpen/FlagEmbedding)
 - [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5): embedding 模型，用于构建 vector DB
 - [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large): rerank 模型，用于对检索回来的文章段落重排
 ### [BCEmbedding](https://github.com/netease-youdao/BCEmbedding?tab=readme-ov-file)
 - [bce-embedding-base_v1](https://hf-mirror.com/maidalun1020/bce-embedding-base_v1): embedding 模型，用于构建 vector DB
@ -157,13 +222,13 @@ RAG的经典评估框架，通过以下三个方面进行评估:
 ### RAG具体流程
- 根据数据集构建vector DB
+- 根据数据集构建 vector DB
- 对用户输入的问题进行embedding
+- 对用户输入的问题进行 embedding
- 基于embedding结果在向量数据库中进行检索
+- 基于 embedding 结果在向量数据库中进行检索
 - 对召回数据重排序
 - 依据用户问题和召回数据生成最后的结果
-**Noted**: 当用户选择使用RAG时才会进行上述流程
+**Note**: 当用户选择使用RAG时才会进行上述流程
 ### 后续增强
--- a/rag/README_EN.md
+++ b/rag/README_EN.md
@ -23,6 +23,13 @@ For details on data collection construction, please refer to [qa_generation_READ
 ## **Components**
 There are two sets of embedding and rerank solutions, BGE and BCE; we recommend using the more powerful **BGE**!
 ### [BGE Github](https://github.com/FlagOpen/FlagEmbedding)
 - [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5): embedding model, used to build vector DB
 - [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large): rerank model, used to rerank retrieved documents 
 ### [BCEmbedding](https://github.com/netease-youdao/BCEmbedding?tab=readme-ov-file)
 - [bce-embedding-base_v1](https://hf-mirror.com/maidalun1020/bce-embedding-base_v1): embedding model, used to build vector DB
@ -63,4 +70,4 @@ Later, more evaluation indicators were added, such as: context recall, etc.
 - Add RAGAS evaluation results to the generation process. For example, when the generated results cannot solve the user's problem, it needs to be regenerated.
 - Add web retrieval to handle cases where the relevant information cannot be retrieved from the vector DB
- Add multi-channel retrieval to increase recall rate. That is, multiple similar queries are generated based on user input for retrieval.
+- Add multi-channel retrieval to increase recall rate. That is, multiple similar queries are generated based on user input for retrieval.
--- a/rag/pdf2md/README.md
+++ b/rag/pdf2md/README.md
@ -0,0 +1,94 @@
 # PDF2MD for RAG
 ## 通过 api_key 调用 PDF2MD
 通过使用doc2x的库，实现将pdf文件转换为结构化md文档。
 通过代码调用(需要提供api_key)：
 ~~~python
 import requests as rq
 import json
 import os
 import zipfile
 class PDF2MD:
    """Convert a PDF to Markdown, LaTeX or DOCX through the doc2x HTTP API.

    The API key is sent as a bearer token on both the upload and the export
    request.
    """

    # Formats whose export is saved as a zip archive and then unpacked.
    _ARCHIVE_FORMATS = ("md", "latex")
    # Every value accepted for the ``to`` argument of ``convert``.
    _SUPPORTED_FORMATS = ("md", "latex", "docx")

    def __init__(self, api_key):
        """Store the API key and the upload/export endpoint URLs."""
        self.api_key = api_key
        self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
        self.export_url = "https://api.doc2x.noedgeai.com/api/export"

    @staticmethod
    def _parse_uuid(event_line):
        """Return the ``uuid`` field of a streamed ``data: {...}`` line.

        Only a leading ``data: `` marker is stripped, so the same text inside
        the JSON payload is left intact.

        Raises:
            json.JSONDecodeError: If the remainder is not valid JSON.
            KeyError: If the payload has no ``uuid`` key.
        """
        marker = "data: "
        if event_line.startswith(marker):
            event_line = event_line[len(marker):]
        return json.loads(event_line)['uuid']

    @staticmethod
    def _unpack_archive(path, filename):
        """Extract the zip at ``path`` into directory ``filename`` and rename its .md file.

        The first ``.md`` file found in the directory is renamed to
        ``<filename>.md``. When no ``.md`` file exists nothing is renamed.
        """
        # Create a folder named after the original file and extract into it.
        os.makedirs(filename, exist_ok=True)
        with zipfile.ZipFile(path) as zip_file:
            zip_file.extractall(filename)
        # Locate the extracted md file.
        extracted_md = next(
            (os.path.join(filename, entry)
             for entry in os.listdir(filename)
             if entry.endswith(".md")),
            None,
        )
        if extracted_md is None:
            # Presumably the case for LaTeX exports: there is no md file to rename.
            return
        # Rename the md file after the source PDF.
        new_md_name = os.path.join(filename, filename + '.md')
        os.rename(extracted_md, new_md_name)
        print("解压并重命名md文件为:", new_md_name)

    def convert(self, filepath, to="md"):
        """Upload ``filepath`` to doc2x and download the converted result.

        The streamed upload response is echoed to stdout and written to
        ``<name>.txt`` in the current working directory, where ``<name>`` is
        the PDF's base name without extension. The export is saved as
        ``<name>.zip`` (``md``/``latex``) or ``<name>.docx``; zip archives are
        unpacked into a ``<name>`` directory.

        HTTP failures are reported on stdout instead of being raised.

        Args:
            filepath: Path of the PDF file to upload.
            to: Target format: ``"md"``, ``"latex"`` or ``"docx"``.

        Raises:
            ValueError: If ``to`` is not a supported format.
        """
        # Reject bad formats before spending an upload on them.
        if to not in self._SUPPORTED_FORMATS:
            raise ValueError(
                "unsupported target format %r; expected one of %s"
                % (to, ", ".join(self._SUPPORTED_FORMATS))
            )
        filename = os.path.splitext(os.path.basename(filepath))[0]
        headers = {"Authorization": "Bearer " + self.api_key}

        # Keep the PDF open only for the duration of the upload.
        with open(filepath, "rb") as pdf_file:
            res = rq.post(self.url, files={"file": pdf_file}, headers=headers, stream=True)
        if res.status_code != 200:
            print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
            return

        # Log every non-empty streamed line; the last one carries the uuid.
        last_line = None
        with open(filename + ".txt", "w", encoding="utf-8") as log_file:
            for raw_line in res.iter_lines():
                if raw_line:
                    last_line = raw_line.decode("utf-8")
                    log_file.write(last_line + "\n")
                    print(last_line)
        if last_line is None:
            print("[ERROR] upload response contained no data lines")
            return
        uuid = self._parse_uuid(last_line)
        print(uuid)

        path = filename + ('.zip' if to in self._ARCHIVE_FORMATS else '.docx')
        res = rq.get(self.export_url, params={"request_id": uuid, "to": to}, headers=headers)
        if res.status_code != 200:
            print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
            return
        with open(path, "wb") as out_file:
            out_file.write(res.content)
        print("下载成功,存入:", path)
        if to in self._ARCHIVE_FORMATS:
            self._unpack_archive(path, filename)
 def main():
    """Run a sample conversion of ``test.pdf`` to Markdown with a placeholder API key."""
    PDF2MD("sk-xxx").convert(r"test.pdf", to="md")
 # Only run the sample conversion when executed as a script.
 if __name__ == "__main__":
    main()
 ~~~
 ## 通过网页使用在线PDF2MD服务：
 doc2x在线服务地址：https://doc2x.noedgeai.com
--- a/rag/pdf2md/pdf2md.py
+++ b/rag/pdf2md/pdf2md.py
@ -0,0 +1,81 @@
 import requests as rq
 import json
 import os
 import zipfile
 class PDF2MD:
    """Convert a PDF to Markdown, LaTeX or DOCX through the doc2x HTTP API.

    The API key is sent as a bearer token on both the upload and the export
    request.
    """

    # Formats whose export is saved as a zip archive and then unpacked.
    _ARCHIVE_FORMATS = ("md", "latex")
    # Every value accepted for the ``to`` argument of ``convert``.
    _SUPPORTED_FORMATS = ("md", "latex", "docx")

    def __init__(self, api_key):
        """Store the API key and the upload/export endpoint URLs."""
        self.api_key = api_key
        self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
        self.export_url = "https://api.doc2x.noedgeai.com/api/export"

    @staticmethod
    def _parse_uuid(event_line):
        """Return the ``uuid`` field of a streamed ``data: {...}`` line.

        Only a leading ``data: `` marker is stripped, so the same text inside
        the JSON payload is left intact.

        Raises:
            json.JSONDecodeError: If the remainder is not valid JSON.
            KeyError: If the payload has no ``uuid`` key.
        """
        marker = "data: "
        if event_line.startswith(marker):
            event_line = event_line[len(marker):]
        return json.loads(event_line)['uuid']

    @staticmethod
    def _unpack_archive(path, filename):
        """Extract the zip at ``path`` into directory ``filename`` and rename its .md file.

        The first ``.md`` file found in the directory is renamed to
        ``<filename>.md``. When no ``.md`` file exists nothing is renamed.
        """
        # Create a folder named after the original file and extract into it.
        os.makedirs(filename, exist_ok=True)
        with zipfile.ZipFile(path) as zip_file:
            zip_file.extractall(filename)
        # Locate the extracted md file.
        extracted_md = next(
            (os.path.join(filename, entry)
             for entry in os.listdir(filename)
             if entry.endswith(".md")),
            None,
        )
        if extracted_md is None:
            # Presumably the case for LaTeX exports: there is no md file to rename.
            return
        # Rename the md file after the source PDF.
        new_md_name = os.path.join(filename, filename + '.md')
        os.rename(extracted_md, new_md_name)
        print("解压并重命名md文件为:", new_md_name)

    def convert(self, filepath, to="md"):
        """Upload ``filepath`` to doc2x and download the converted result.

        The streamed upload response is echoed to stdout and written to
        ``<name>.txt`` in the current working directory, where ``<name>`` is
        the PDF's base name without extension. The export is saved as
        ``<name>.zip`` (``md``/``latex``) or ``<name>.docx``; zip archives are
        unpacked into a ``<name>`` directory.

        HTTP failures are reported on stdout instead of being raised.

        Args:
            filepath: Path of the PDF file to upload.
            to: Target format: ``"md"``, ``"latex"`` or ``"docx"``.

        Raises:
            ValueError: If ``to`` is not a supported format.
        """
        # Reject bad formats before spending an upload on them.
        if to not in self._SUPPORTED_FORMATS:
            raise ValueError(
                "unsupported target format %r; expected one of %s"
                % (to, ", ".join(self._SUPPORTED_FORMATS))
            )
        filename = os.path.splitext(os.path.basename(filepath))[0]
        headers = {"Authorization": "Bearer " + self.api_key}

        # Keep the PDF open only for the duration of the upload.
        with open(filepath, "rb") as pdf_file:
            res = rq.post(self.url, files={"file": pdf_file}, headers=headers, stream=True)
        if res.status_code != 200:
            print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
            return

        # Log every non-empty streamed line; the last one carries the uuid.
        last_line = None
        with open(filename + ".txt", "w", encoding="utf-8") as log_file:
            for raw_line in res.iter_lines():
                if raw_line:
                    last_line = raw_line.decode("utf-8")
                    log_file.write(last_line + "\n")
                    print(last_line)
        if last_line is None:
            print("[ERROR] upload response contained no data lines")
            return
        uuid = self._parse_uuid(last_line)
        print(uuid)

        path = filename + ('.zip' if to in self._ARCHIVE_FORMATS else '.docx')
        res = rq.get(self.export_url, params={"request_id": uuid, "to": to}, headers=headers)
        if res.status_code != 200:
            print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
            return
        with open(path, "wb") as out_file:
            out_file.write(res.content)
        print("下载成功,存入:", path)
        if to in self._ARCHIVE_FORMATS:
            self._unpack_archive(path, filename)
 def main():
    """Run a sample conversion of ``test.pdf`` to Markdown with a placeholder API key."""
    PDF2MD("sk-xxx").convert(r"test.pdf", to="md")
 # Only run the sample conversion when executed as a script.
 if __name__ == "__main__":
    main()
--- a/rag/requirements.txt
+++ b/rag/requirements.txt
@ -9,4 +9,6 @@ langchain_core==0.1.33
 langchain_openai==0.0.8
 langchain_text_splitters==0.0.1
 FlagEmbedding==1.2.8
-unstructured==0.12.6
+unstructured==0.12.6
 PyJWT
 faiss-gpu  # faiss-cpu for device without gpu
--- a/rag/src/data_processing.py
+++ b/rag/src/data_processing.py
@ -145,9 +145,9 @@ class Data_process():
        split_docs = []
        logger.info(f'Loading txt files from {data_path}')
        if os.path.isdir(data_path):
-                loader = DirectoryLoader(data_path, glob="**/*.txt",show_progress=True)
+            loader = DirectoryLoader(data_path, glob="**/*.txt",show_progress=True)
-                docs = loader.load()
+            docs = loader.load()
-                split_docs = text_spliter.split_documents(docs)
+            split_docs = text_spliter.split_documents(docs)
        elif data_path.endswith('.txt'): 
            file_path = data_path
            logger.info(f'splitting file {file_path}')
@ -246,4 +246,4 @@ if __name__ == "__main__":
    logger.info("After reranking...")
    for i in range(len(scores)):
        logger.info(str(scores[i]) + '\n')
-        logger.info(passages[i])
+        logger.info(passages[i])