From 975783ecfb740be49c538a6093229575f87c92cc Mon Sep 17 00:00:00 2001
From: wql <626394316@qq.com>
Date: Sat, 27 Apr 2024 22:35:32 +0800
Subject: [PATCH 01/11] Add the pdf2md script

---
 rag/pdf2md/README.md | 92 ++++++++++++++++++++++++++++++++++++++++++++
 rag/pdf2md/pdf2md.py | 81 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 173 insertions(+)
 create mode 100644 rag/pdf2md/README.md
 create mode 100644 rag/pdf2md/pdf2md.py

diff --git a/rag/pdf2md/README.md b/rag/pdf2md/README.md
new file mode 100644
index 0000000..3cd74c3
--- /dev/null
+++ b/rag/pdf2md/README.md
@@ -0,0 +1,92 @@
+## RAG pdf2md
+
+Uses the doc2x service to convert PDF files into structured Markdown documents.
+
+Call it from code (an api_key is required):
+
+~~~
+import requests as rq
+import json
+import os
+import zipfile
+
+class PDF2MD:
+    def __init__(self, api_key):
+        self.api_key = api_key
+        self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
+        self.export_url = "https://api.doc2x.noedgeai.com/api/export"
+
+    def convert(self, filepath, to="md"):
+        filename = os.path.splitext(os.path.basename(filepath))[0]
+
+        # Upload the PDF; the file handle is closed once the request is sent
+        with open(filepath, "rb") as pdf_file:
+            res = rq.post(self.url, files={"file": pdf_file},
+                          headers={"Authorization": "Bearer " + self.api_key}, stream=True)
+
+        if res.status_code == 200:
+            txt_path = filename + ".txt"
+            decoded_line = ""
+            with open(txt_path, "w", encoding="utf-8") as f:
+                for line in res.iter_lines():
+                    if len(line) > 0:
+                        decoded_line = line.decode("utf-8")
+                        f.write(decoded_line + "\n")
+                        print(decoded_line)
+
+            # The last non-empty stream line carries the request uuid
+            uuid = json.loads(decoded_line.replace("data: ", ""))["uuid"]
+            print(uuid)
+
+            if to in ("md", "latex"):
+                path = filename + ".zip"
+            elif to == "docx":
+                path = filename + ".docx"
+            else:
+                raise ValueError("unsupported export format: " + to)
+
+            export_url = self.export_url + "?request_id=" + uuid + "&to=" + to
+            res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key})
+
+            if res.status_code == 200:
+                with open(path, "wb") as f:
+                    f.write(res.content)
+                print("Download succeeded, saved to:", path)
+
+                if to in ("md", "latex"):
+                    # Create a folder named after the original file
+                    if not os.path.exists(filename):
+                        os.mkdir(filename)
+
+                    # Extract into that folder
+                    with zipfile.ZipFile(path) as zip_file:
+                        zip_file.extractall(filename)
+
+                    # Locate the extracted md file
+                    extracted_md = None
+                    for file in os.listdir(filename):
+                        if file.endswith(".md"):
+                            extracted_md = os.path.join(filename, file)
+                            break
+
+                    # Rename the md file after the original PDF
+                    new_md_name = os.path.join(filename, filename + ".md")
+                    os.rename(extracted_md, new_md_name)
+                    print("Extracted and renamed the md file to:", new_md_name)
+
+            else:
+                print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+        else:
+            print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+
+
+def main():
+    api_key = "sk-xxx"
+    filepath = r"test.pdf"
+    converter = PDF2MD(api_key)
+    converter.convert(filepath, to="md")
+
+
+if __name__ == "__main__":
+    main()
+~~~
+
+Convert online via the web page:
+
+Online service: https://doc2x.noedgeai.com
\ No newline at end of file
diff --git a/rag/pdf2md/pdf2md.py b/rag/pdf2md/pdf2md.py
new file mode 100644
index 0000000..0e39a39
--- /dev/null
+++ b/rag/pdf2md/pdf2md.py
@@ -0,0 +1,81 @@
+import requests as rq
+import json
+import os
+import zipfile
+
+
+class PDF2MD:
+    def __init__(self, api_key):
+        self.api_key = api_key
+        self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
+        self.export_url = "https://api.doc2x.noedgeai.com/api/export"
+
+    def convert(self, filepath, to="md"):
+        filename = os.path.splitext(os.path.basename(filepath))[0]
+
+        # Upload the PDF; the file handle is closed once the request is sent
+        with open(filepath, "rb") as pdf_file:
+            res = rq.post(self.url, files={"file": pdf_file},
+                          headers={"Authorization": "Bearer " + self.api_key}, stream=True)
+
+        if res.status_code == 200:
+            txt_path = filename + ".txt"
+            decoded_line = ""
+            with open(txt_path, "w", encoding="utf-8") as f:
+                for line in res.iter_lines():
+                    if len(line) > 0:
+                        decoded_line = line.decode("utf-8")
+                        f.write(decoded_line + "\n")
+                        print(decoded_line)
+
+            # The last non-empty stream line carries the request uuid
+            uuid = json.loads(decoded_line.replace("data: ", ""))["uuid"]
+            print(uuid)
+
+            if to in ("md", "latex"):
+                path = filename + ".zip"
+            elif to == "docx":
+                path = filename + ".docx"
+            else:
+                raise ValueError("unsupported export format: " + to)
+
+            export_url = self.export_url + "?request_id=" + uuid + "&to=" + to
+            res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key})
+
+            if res.status_code == 200:
+                with open(path, "wb") as f:
+                    f.write(res.content)
+                print("Download succeeded, saved to:", path)
+
+                if to in ("md", "latex"):
+                    # Create a folder named after the original file
+                    if not os.path.exists(filename):
+                        os.mkdir(filename)
+
+                    # Extract into that folder
+                    with zipfile.ZipFile(path) as zip_file:
+                        zip_file.extractall(filename)
+
+                    # Locate the extracted md file
+                    extracted_md = None
+                    for file in os.listdir(filename):
+                        if file.endswith(".md"):
+                            extracted_md = os.path.join(filename, file)
+                            break
+
+                    # Rename the md file after the original PDF
+                    new_md_name = os.path.join(filename, filename + ".md")
+                    os.rename(extracted_md, new_md_name)
+                    print("Extracted and renamed the md file to:", new_md_name)
+
+            else:
+                print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+        else:
+            print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+
+
+def main():
+    api_key = "sk-xxx"
+    filepath = r"test.pdf"
+    converter = PDF2MD(api_key)
+    converter.convert(filepath, to="md")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
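The `convert` method above handles one PDF per call. A small batch driver over a folder of PDFs could look like the sketch below; the `pdfs/` input directory and the api_key placeholder are illustrative assumptions, not part of the committed script:

```python
# Hypothetical batch driver for the PDF2MD class from rag/pdf2md/pdf2md.py.
# The "pdfs/" directory and the api_key value are placeholders only.
import os

from pdf2md import PDF2MD


def convert_folder(api_key, folder="pdfs"):
    converter = PDF2MD(api_key)
    for name in os.listdir(folder):
        if name.lower().endswith(".pdf"):
            # Each call writes <name>.txt, downloads the export zip, and
            # unpacks the Markdown into a folder named after the PDF.
            converter.convert(os.path.join(folder, name), to="md")


if __name__ == "__main__":
    convert_folder("sk-xxx")
```
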
From 7996422c4d9d1716e2b71cdc7048767900413be3 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Mon, 29 Apr 2024 11:28:50 +0900
Subject: [PATCH 02/11] Update README.md (title and format)

---
 rag/pdf2md/README.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/rag/pdf2md/README.md b/rag/pdf2md/README.md
index 3cd74c3..6f669b3 100644
--- a/rag/pdf2md/README.md
+++ b/rag/pdf2md/README.md
@@ -1,10 +1,12 @@
-## RAG pdf2md
+# PDF2MD for RAG
+
+## Using PDF2MD with an api_key
 
 Uses the doc2x service to convert PDF files into structured Markdown documents.
 
 Call it from code (an api_key is required):
 
-~~~
+~~~python
 import requests as rq
 import json
 import os
 import zipfile
@@ -87,6 +89,6 @@ if __name__ == "__main__":
     main()
 ~~~
 
-Convert online via the web page:
+## Using the online PDF2MD service via the web page
 
-Online service: https://doc2x.noedgeai.com
\ No newline at end of file
+doc2x online service: https://doc2x.noedgeai.com

From 9d0ca5bc1fe2c9e3e7916b92d2a62fef0b4e0235 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 00:27:43 +0900
Subject: [PATCH 03/11] Update pipeline.py: fix the import file paths

---
 rag/src/pipeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rag/src/pipeline.py b/rag/src/pipeline.py
index 08b9b96..52e0012 100644
--- a/rag/src/pipeline.py
+++ b/rag/src/pipeline.py
@@ -2,8 +2,8 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import PromptTemplate
 from transformers.utils import logging
 
-from rag.src.data_processing import Data_process
-from rag.src.config.config import prompt_template
+from data_processing import Data_process
+from config.config import prompt_template
 
 logger = logging.get_logger(__name__)
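Patch 03 (and patch 04 below) switch these modules from package-qualified to script-relative imports, and patches 10 and 11 later switch them back. A fallback import would serve both launch modes; this is an editorial sketch under that assumption, not code from the series:

```python
# Sketch for rag/src/pipeline.py: prefer the package-qualified import used
# when the RAG code runs inside the repo (e.g. via web_internlm2.py), and
# fall back to the script-relative form so `python main.py` from rag/src
# keeps working without edits.
try:
    from rag.src.data_processing import Data_process
    from rag.src.config.config import prompt_template
except ImportError:
    from data_processing import Data_process
    from config.config import prompt_template
```
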
From 92782a13ea388df92262b8685e2f56d72b1aac96 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 00:29:05 +0900
Subject: [PATCH 04/11] Update data_processing.py

---
 rag/src/data_processing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rag/src/data_processing.py b/rag/src/data_processing.py
index 6c4103b..ca6174b 100644
--- a/rag/src/data_processing.py
+++ b/rag/src/data_processing.py
@@ -4,7 +4,7 @@ import os
 from loguru import logger
 from langchain_community.vectorstores import FAISS
 
-from rag.src.config.config import (
+from config.config import (
     embedding_path,
     embedding_model_name,
     doc_dir, qa_dir,
@@ -246,4 +246,4 @@ if __name__ == "__main__":
     logger.info("After reranking...")
     for i in range(len(scores)):
         logger.info(str(scores[i]) + '\n')
-        logger.info(passages[i])
\ No newline at end of file
+        logger.info(passages[i])

From 194e1d7462aadfb6a00caece817ceac144c70c76 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 00:32:05 +0900
Subject: [PATCH 05/11] Update requirements.txt: add PyJWT (for `import jwt`)

---
 rag/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rag/requirements.txt b/rag/requirements.txt
index 0dd7fe6..671e7b3 100644
--- a/rag/requirements.txt
+++ b/rag/requirements.txt
@@ -9,4 +9,5 @@ langchain_core==0.1.33
 langchain_openai==0.0.8
 langchain_text_splitters==0.0.1
 FlagEmbedding==1.2.8
-unstructured==0.12.6
\ No newline at end of file
+unstructured==0.12.6
+PyJWT

From e01eea598475c9562f1240125c809ce76c6ead5a Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 00:33:11 +0900
Subject: [PATCH 06/11] Update requirements.txt: add faiss-gpu

---
 rag/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rag/requirements.txt b/rag/requirements.txt
index 671e7b3..6bac0a7 100644
--- a/rag/requirements.txt
+++ b/rag/requirements.txt
@@ -11,3 +11,4 @@ langchain_text_splitters==0.0.1
 FlagEmbedding==1.2.8
 unstructured==0.12.6
 PyJWT
+faiss-gpu  # use faiss-cpu on devices without a GPU

From 93a7a8c25d000fcc1f8d2d87c2da5878ebf81ea1 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 00:48:46 +0900
Subject: [PATCH 07/11] Update data_processing.py format

---
 rag/src/data_processing.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rag/src/data_processing.py b/rag/src/data_processing.py
index ca6174b..1939d6f 100644
--- a/rag/src/data_processing.py
+++ b/rag/src/data_processing.py
@@ -145,9 +145,9 @@ class Data_process():
     split_docs = []
     logger.info(f'Loading txt files from {data_path}')
     if os.path.isdir(data_path):
-        loader = DirectoryLoader(data_path, glob="**/*.txt",show_progress=True)
-        docs = loader.load()
-        split_docs = text_spliter.split_documents(docs)
+        loader = DirectoryLoader(data_path, glob="**/*.txt", show_progress=True)
+        docs = loader.load()
+        split_docs = text_spliter.split_documents(docs)
     elif data_path.endswith('.txt'):
         file_path = data_path
         logger.info(f'splitting file {file_path}')
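Patches 04 through 07 all touch the txt-ingestion path that feeds the FAISS index. Condensed, the build flow they exercise looks roughly like the sketch below; the directory names, chunk sizes, and the BGE embedding model are assumptions drawn from the READMEs, not the exact EmoLLM code:

```python
# Rough sketch of the vector-DB build in rag/src/data_processing.py.
# Paths, chunk sizes, and the embedding model are illustrative assumptions.
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter


def build_vector_db(data_dir="data/txt", vector_db_dir="data/vector_db"):
    # Load every txt file under data_dir and split it into chunks
    docs = DirectoryLoader(data_dir, glob="**/*.txt", show_progress=True).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = splitter.split_documents(docs)

    # Embed the chunks and persist the index (index.faiss + index.pkl)
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
    db = FAISS.from_documents(split_docs, embeddings)
    db.save_local(vector_db_dir)
    return db
```
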
 FlagEmbedding==1.2.8
 unstructured==0.12.6
+PyJWT
+faiss-gpu  # use faiss-cpu on devices without a GPU
 ```
 
 ```python
@@ -32,10 +34,18 @@ pip3 install -r requirements.txt
 
 ### Prepare the data
 
-- txt data: put it under the src.data.txt directory
-- json data: put it under the src.data.json directory
+#### Build your own Vector DB
+
+##### TXT data
+
+Convert the knowledge base you want to index into txt files and put them under the src.data.txt directory
+
+##### JSON data
+
+Build QA pairs and generate JSON files (multi-turn dialogue), then put them under the src.data.json directory
+
+The data format is as follows
 
-The JSON data format is as follows
 ```python
 [
     {
         "conversation": [
             {
                 "input": "...",
                 "output": "..."
             },
             ...
         ]
     },
     ...
 ]
 ```
 
-A vector DB is built from the prepared data; a folder named vector_db containing index.faiss and index.pkl ends up under the data folder
+A vector DB is built from the prepared data; a folder named vector_db containing index.faiss and index.pkl ends up under the data folder. If a vector DB already exists, it is loaded directly
 
-If a vector DB already exists, the corresponding database is loaded directly
+- You can download a prebuilt DB directly from xlab (run the code below inside the rag directory)
 
-
-**Note**: you can download a prebuilt DB directly from xlab (run the code below inside the rag directory)
 ```bash
 # https://openxlab.org.cn/models/detail/Anooyman/EmoLLMRAGTXT/tree/main
 git lfs install
 git clone https://code.openxlab.org.cn/Anooyman/EmoLLMRAGTXT.git
 ```
 
+- You can also download the dataset from the ModelScope community
+
+```bash
+# https://www.modelscope.cn/datasets/Anooyman/EmoLLMRAGTXT/summary
+git clone https://www.modelscope.cn/datasets/Anooyman/EmoLLMRAGTXT.git
+```
+
 
 ### Configure the config file
@@ -106,7 +121,50 @@ prompt_template = """
 """
 ```
 
-### Invocation
+### Local invocation
+
+*Note*
+Because the RAG code has been integrated into `web_internlm2.py`, the import paths no longer work for local invocation,
+so adjust the corresponding import paths as follows
+
+- src/data_processing.py
+```python
+#from rag.src.config.config import (
+#    embedding_path,
+#    embedding_model_name,
+#    doc_dir, qa_dir,
+#    knowledge_pkl_path,
+#    data_dir,
+#    vector_db_dir,
+#    rerank_path,
+#    rerank_model_name,
+#    chunk_size,
+#    chunk_overlap
+#)
+from config.config import (
+    embedding_path,
+    embedding_model_name,
+    doc_dir, qa_dir,
+    knowledge_pkl_path,
+    data_dir,
+    vector_db_dir,
+    rerank_path,
+    rerank_model_name,
+    chunk_size,
+    chunk_overlap
+)
+```
+
+- src/pipeline.py
+```python
+#from rag.src.data_processing import Data_process
+#from rag.src.config.config import prompt_template
+
+from data_processing import Data_process
+from config.config import prompt_template
+```
+
+After adjusting the import paths, run:
 
 ```python
 cd rag/src
 python main.py
 ```
 
 ## **Components**
 
+We provide two embedding/rerank combinations here, BGE and BCEmbedding; the better-performing BGE is the recommended one
+
+### [BGE Github](https://github.com/FlagOpen/FlagEmbedding)
+
+- [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5): embedding model, used to build the vector DB
+- [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large): rerank model, used to rerank retrieved passages
+
 ### [BCEmbedding](https://github.com/netease-youdao/BCEmbedding?tab=readme-ov-file)
 
 - [bce-embedding-base_v1](https://hf-mirror.com/maidalun1020/bce-embedding-base_v1): embedding model, used to build the vector DB
@@ -157,13 +222,13 @@ The classic RAG evaluation framework evaluates the following three aspects:
 
 ### The RAG pipeline in detail
 
-- Build vector DB based on dataset
-- Embedding the user input question
-- Retrieve in vector database based on embedding result
+- Build the vector DB from the dataset
+- Embed the user's input question
+- Retrieve from the vector database based on the embedding result
 - Rerank the recalled data
 - Generate the final answer from the user's question and the recalled data
 
-**Noted**: the above flow runs only when the user chooses to use RAG
+**Note**: the above flow runs only when the user chooses to use RAG
 
 ### Future enhancements
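The pipeline steps patch 08 documents (embed the question, retrieve, rerank, generate) condense into a short sketch built on the BGE models named above; the DB path and the k/top_n cut-offs are assumptions, not the exact EmoLLM implementation:

```python
# Sketch of the query-time flow from the README: embed the question,
# retrieve from FAISS, rerank with BGE. Path and cut-offs are assumptions.
from FlagEmbedding import FlagReranker
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
db = FAISS.load_local("data/vector_db", embeddings)
reranker = FlagReranker("BAAI/bge-reranker-large", use_fp16=True)


def retrieve(query, k=10, top_n=3):
    # Recall k candidates by embedding similarity, then keep the top_n
    # passages the reranker scores highest for (query, passage) pairs.
    docs = db.similarity_search(query, k=k)
    scores = reranker.compute_score([[query, d.page_content] for d in docs])
    ranked = sorted(zip(scores, docs), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked[:top_n]]
```
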
From 50c6f9bf3cbfa643d326de93da02a37b45452da7 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 13:11:43 +0900
Subject: [PATCH 09/11] Update README_EN.md

---
 rag/README_EN.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/rag/README_EN.md b/rag/README_EN.md
index df4fe43..a3d3386 100644
--- a/rag/README_EN.md
+++ b/rag/README_EN.md
@@ -23,6 +23,13 @@ For details on data collection construction, please refer to [qa_generation_READ
 
 ## **Components**
 
+There are two embedding-and-rerank solutions, BGE and BCEmbedding; we recommend the more powerful **BGE**!
+
+### [BGE Github](https://github.com/FlagOpen/FlagEmbedding)
+
+- [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5): embedding model, used to build vector DB
+- [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large): rerank model, used to rerank retrieved documents
+
 ### [BCEmbedding](https://github.com/netease-youdao/BCEmbedding?tab=readme-ov-file)
 
 - [bce-embedding-base_v1](https://hf-mirror.com/maidalun1020/bce-embedding-base_v1): embedding model, used to build vector DB
@@ -63,4 +70,4 @@ Later, more evaluation indicators were added, such as: context recall, etc.
 
 - Add RAGAS evaluation results to the generation process. For example, when the generated results cannot solve the user's problem, it needs to be regenerated.
 - Add web retrieval to deal with the problem that the corresponding information cannot be retrieved in vector DB
-- Add multi-channel retrieval to increase recall rate. That is, multiple similar queries are generated based on user input for retrieval.
\ No newline at end of file
+- Add multi-channel retrieval to increase recall rate. That is, multiple similar queries are generated based on user input for retrieval.

From 63e32019f48f6d0057fa9db012331694fb295db4 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Sat, 4 May 2024 12:03:34 +0900
Subject: [PATCH 10/11] Update data_processing.py: add the rag.src. import prefix back

---
 rag/src/data_processing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rag/src/data_processing.py b/rag/src/data_processing.py
index 1939d6f..968012e 100644
--- a/rag/src/data_processing.py
+++ b/rag/src/data_processing.py
@@ -4,7 +4,7 @@ import os
 from loguru import logger
 from langchain_community.vectorstores import FAISS
 
-from config.config import (
+from rag.src.config.config import (
     embedding_path,
     embedding_model_name,
     doc_dir, qa_dir,

From ff1cd02812498199c4587db16086ef7bd81be79c Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Sat, 4 May 2024 12:04:13 +0900
Subject: [PATCH 11/11] Update pipeline.py: add the rag.src. import prefix back

---
 rag/src/pipeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rag/src/pipeline.py b/rag/src/pipeline.py
index 52e0012..08b9b96 100644
--- a/rag/src/pipeline.py
+++ b/rag/src/pipeline.py
@@ -2,8 +2,8 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import PromptTemplate
 from transformers.utils import logging
 
-from data_processing import Data_process
-from config.config import prompt_template
+from rag.src.data_processing import Data_process
+from rag.src.config.config import prompt_template
 
 logger = logging.get_logger(__name__)
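README_EN's planned multi-channel retrieval (several similar queries per user input) could be layered on top of the retriever along these lines; `generate_similar_queries` is a hypothetical helper (e.g., an LLM paraphrase call), not an existing function in the repo:

```python
# Editorial sketch of the multi-channel retrieval enhancement proposed in
# rag/README_EN.md. generate_similar_queries is a hypothetical callable
# that paraphrases the user query, e.g. via an LLM call.
def multi_channel_retrieve(db, generate_similar_queries, query, k=5):
    queries = [query] + generate_similar_queries(query, n=3)
    seen, merged = set(), []
    for q in queries:
        for doc in db.similarity_search(q, k=k):
            # Deduplicate on content so downstream reranking sees each
            # recalled passage only once.
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                merged.append(doc)
    return merged
```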