From 975783ecfb740be49c538a6093229575f87c92cc Mon Sep 17 00:00:00 2001
From: wql <626394316@qq.com>
Date: Sat, 27 Apr 2024 22:35:32 +0800
Subject: [PATCH 01/11] Add the pdf2md script

---
 rag/pdf2md/README.md | 92 ++++++++++++++++++++++++++++++++++++++++++++
 rag/pdf2md/pdf2md.py | 81 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 173 insertions(+)
 create mode 100644 rag/pdf2md/README.md
 create mode 100644 rag/pdf2md/pdf2md.py

diff --git a/rag/pdf2md/README.md b/rag/pdf2md/README.md
new file mode 100644
index 0000000..3cd74c3
--- /dev/null
+++ b/rag/pdf2md/README.md
@@ -0,0 +1,92 @@
+## RAG pdf2md
+
+Uses the doc2x service to convert PDF files into structured Markdown documents.
+
+Call it from code (an api_key is required):
+
+~~~
+import requests as rq
+import json
+import os
+import zipfile
+
+class PDF2MD:
+    def __init__(self, api_key):
+        self.api_key = api_key
+        self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
+        self.export_url = "https://api.doc2x.noedgeai.com/api/export"
+
+    def convert(self, filepath, to="md"):
+        filename = os.path.splitext(os.path.basename(filepath))[0]
+
+        # Upload the PDF; the file handle is closed once the request is sent
+        with open(filepath, "rb") as pdf_file:
+            res = rq.post(self.url, files={"file": pdf_file},
+                          headers={"Authorization": "Bearer " + self.api_key}, stream=True)
+
+        if res.status_code == 200:
+            txt_path = filename + ".txt"
+            decoded_line = ""
+            with open(txt_path, "w", encoding="utf-8") as f:
+                for line in res.iter_lines():
+                    if len(line) > 0:
+                        decoded_line = line.decode("utf-8")
+                        f.write(decoded_line + "\n")
+                        print(decoded_line)
+
+            # The last non-empty stream line carries the request uuid
+            uuid = json.loads(decoded_line.replace("data: ", ""))["uuid"]
+            print(uuid)
+
+            if to in ("md", "latex"):
+                path = filename + ".zip"
+            elif to == "docx":
+                path = filename + ".docx"
+            else:
+                raise ValueError("unsupported export format: " + to)
+
+            export_url = self.export_url + "?request_id=" + uuid + "&to=" + to
+            res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key})
+
+            if res.status_code == 200:
+                with open(path, "wb") as f:
+                    f.write(res.content)
+                print("Download succeeded, saved to:", path)
+
+                if to in ("md", "latex"):
+                    # Create a folder named after the original file
+                    if not os.path.exists(filename):
+                        os.mkdir(filename)
+
+                    # Extract into that folder
+                    with zipfile.ZipFile(path) as zip_file:
+                        zip_file.extractall(filename)
+
+                    # Locate the extracted md file
+                    extracted_md = None
+                    for file in os.listdir(filename):
+                        if file.endswith(".md"):
+                            extracted_md = os.path.join(filename, file)
+                            break
+
+                    # Rename the md file after the original PDF
+                    new_md_name = os.path.join(filename, filename + ".md")
+                    os.rename(extracted_md, new_md_name)
+                    print("Extracted and renamed the md file to:", new_md_name)
+
+            else:
+                print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+        else:
+            print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+
+
+def main():
+    api_key = "sk-xxx"
+    filepath = r"test.pdf"
+    converter = PDF2MD(api_key)
+    converter.convert(filepath, to="md")
+
+
+if __name__ == "__main__":
+    main()
+~~~
+
+Convert online via the web page:
+
+Online service: https://doc2x.noedgeai.com
\ No newline at end of file
diff --git a/rag/pdf2md/pdf2md.py b/rag/pdf2md/pdf2md.py
new file mode 100644
index 0000000..0e39a39
--- /dev/null
+++ b/rag/pdf2md/pdf2md.py
@@ -0,0 +1,81 @@
+import requests as rq
+import json
+import os
+import zipfile
+
+
+class PDF2MD:
+    def __init__(self, api_key):
+        self.api_key = api_key
+        self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
+        self.export_url = "https://api.doc2x.noedgeai.com/api/export"
+
+    def convert(self, filepath, to="md"):
+        filename = os.path.splitext(os.path.basename(filepath))[0]
+
+        # Upload the PDF; the file handle is closed once the request is sent
+        with open(filepath, "rb") as pdf_file:
+            res = rq.post(self.url, files={"file": pdf_file},
+                          headers={"Authorization": "Bearer " + self.api_key}, stream=True)
+
+        if res.status_code == 200:
+            txt_path = filename + ".txt"
+            decoded_line = ""
+            with open(txt_path, "w", encoding="utf-8") as f:
+                for line in res.iter_lines():
+                    if len(line) > 0:
+                        decoded_line = line.decode("utf-8")
+                        f.write(decoded_line + "\n")
+                        print(decoded_line)
+
+            # The last non-empty stream line carries the request uuid
+            uuid = json.loads(decoded_line.replace("data: ", ""))["uuid"]
+            print(uuid)
+
+            if to in ("md", "latex"):
+                path = filename + ".zip"
+            elif to == "docx":
+                path = filename + ".docx"
+            else:
+                raise ValueError("unsupported export format: " + to)
+
+            export_url = self.export_url + "?request_id=" + uuid + "&to=" + to
+            res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key})
+
+            if res.status_code == 200:
+                with open(path, "wb") as f:
+                    f.write(res.content)
+                print("Download succeeded, saved to:", path)
+
+                if to in ("md", "latex"):
+                    # Create a folder named after the original file
+                    if not os.path.exists(filename):
+                        os.mkdir(filename)
+
+                    # Extract into that folder
+                    with zipfile.ZipFile(path) as zip_file:
+                        zip_file.extractall(filename)
+
+                    # Locate the extracted md file
+                    extracted_md = None
+                    for file in os.listdir(filename):
+                        if file.endswith(".md"):
+                            extracted_md = os.path.join(filename, file)
+                            break
+
+                    # Rename the md file after the original PDF
+                    new_md_name = os.path.join(filename, filename + ".md")
+                    os.rename(extracted_md, new_md_name)
+                    print("Extracted and renamed the md file to:", new_md_name)
+
+            else:
+                print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+        else:
+            print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+
+
+def main():
+    api_key = "sk-xxx"
+    filepath = r"test.pdf"
+    converter = PDF2MD(api_key)
+    converter.convert(filepath, to="md")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
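The `convert` method above handles one PDF per call. A small batch driver over a folder of PDFs could look like the sketch below; the `pdfs/` input directory and the api_key placeholder are illustrative assumptions, not part of the committed script:

```python
# Hypothetical batch driver for the PDF2MD class from rag/pdf2md/pdf2md.py.
# The "pdfs/" directory and the api_key value are placeholders only.
import os

from pdf2md import PDF2MD


def convert_folder(api_key, folder="pdfs"):
    converter = PDF2MD(api_key)
    for name in os.listdir(folder):
        if name.lower().endswith(".pdf"):
            # Each call writes <name>.txt, downloads the export zip, and
            # unpacks the Markdown into a folder named after the PDF.
            converter.convert(os.path.join(folder, name), to="md")


if __name__ == "__main__":
    convert_folder("sk-xxx")
```
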
From 7996422c4d9d1716e2b71cdc7048767900413be3 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Mon, 29 Apr 2024 11:28:50 +0900
Subject: [PATCH 02/11] Update README.md (title and format)

---
 rag/pdf2md/README.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/rag/pdf2md/README.md b/rag/pdf2md/README.md
index 3cd74c3..6f669b3 100644
--- a/rag/pdf2md/README.md
+++ b/rag/pdf2md/README.md
@@ -1,10 +1,12 @@
-## RAG pdf2md
+# PDF2MD for RAG
+
+## Using PDF2MD with an api_key
 
 Uses the doc2x service to convert PDF files into structured Markdown documents.
 
 Call it from code (an api_key is required):
 
-~~~
+~~~python
 import requests as rq
 import json
 import os
 import zipfile
@@ -87,6 +89,6 @@ if __name__ == "__main__":
     main()
 ~~~
 
-Convert online via the web page:
+## Using the online PDF2MD service via the web page
 
-Online service: https://doc2x.noedgeai.com
\ No newline at end of file
+doc2x online service: https://doc2x.noedgeai.com

From 9d0ca5bc1fe2c9e3e7916b92d2a62fef0b4e0235 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 00:27:43 +0900
Subject: [PATCH 03/11] Update pipeline.py: fix the import file paths

---
 rag/src/pipeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rag/src/pipeline.py b/rag/src/pipeline.py
index 08b9b96..52e0012 100644
--- a/rag/src/pipeline.py
+++ b/rag/src/pipeline.py
@@ -2,8 +2,8 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import PromptTemplate
 from transformers.utils import logging
 
-from rag.src.data_processing import Data_process
-from rag.src.config.config import prompt_template
+from data_processing import Data_process
+from config.config import prompt_template
 
 logger = logging.get_logger(__name__)
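Patch 03 (and patch 04 below) switch these modules from package-qualified to script-relative imports, and patches 10 and 11 later switch them back. A fallback import would serve both launch modes; this is an editorial sketch under that assumption, not code from the series:

```python
# Sketch for rag/src/pipeline.py: prefer the package-qualified import used
# when the RAG code runs inside the repo (e.g. via web_internlm2.py), and
# fall back to the script-relative form so `python main.py` from rag/src
# keeps working without edits.
try:
    from rag.src.data_processing import Data_process
    from rag.src.config.config import prompt_template
except ImportError:
    from data_processing import Data_process
    from config.config import prompt_template
```
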
From 92782a13ea388df92262b8685e2f56d72b1aac96 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 00:29:05 +0900
Subject: [PATCH 04/11] Update data_processing.py

---
 rag/src/data_processing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rag/src/data_processing.py b/rag/src/data_processing.py
index 6c4103b..ca6174b 100644
--- a/rag/src/data_processing.py
+++ b/rag/src/data_processing.py
@@ -4,7 +4,7 @@ import os
 from loguru import logger
 from langchain_community.vectorstores import FAISS
 
-from rag.src.config.config import (
+from config.config import (
     embedding_path,
     embedding_model_name,
     doc_dir, qa_dir,
@@ -246,4 +246,4 @@ if __name__ == "__main__":
     logger.info("After reranking...")
     for i in range(len(scores)):
         logger.info(str(scores[i]) + '\n')
-        logger.info(passages[i])
\ No newline at end of file
+        logger.info(passages[i])

From 194e1d7462aadfb6a00caece817ceac144c70c76 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 00:32:05 +0900
Subject: [PATCH 05/11] Update requirements.txt: add PyJWT (for `import jwt`)

---
 rag/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rag/requirements.txt b/rag/requirements.txt
index 0dd7fe6..671e7b3 100644
--- a/rag/requirements.txt
+++ b/rag/requirements.txt
@@ -9,4 +9,5 @@ langchain_core==0.1.33
 langchain_openai==0.0.8
 langchain_text_splitters==0.0.1
 FlagEmbedding==1.2.8
-unstructured==0.12.6
\ No newline at end of file
+unstructured==0.12.6
+PyJWT

From e01eea598475c9562f1240125c809ce76c6ead5a Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 00:33:11 +0900
Subject: [PATCH 06/11] Update requirements.txt: add faiss-gpu

---
 rag/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rag/requirements.txt b/rag/requirements.txt
index 671e7b3..6bac0a7 100644
--- a/rag/requirements.txt
+++ b/rag/requirements.txt
@@ -11,3 +11,4 @@ langchain_text_splitters==0.0.1
 FlagEmbedding==1.2.8
 unstructured==0.12.6
 PyJWT
+faiss-gpu  # use faiss-cpu on devices without a GPU

From 93a7a8c25d000fcc1f8d2d87c2da5878ebf81ea1 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 00:48:46 +0900
Subject: [PATCH 07/11] Update data_processing.py format

---
 rag/src/data_processing.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rag/src/data_processing.py b/rag/src/data_processing.py
index ca6174b..1939d6f 100644
--- a/rag/src/data_processing.py
+++ b/rag/src/data_processing.py
@@ -145,9 +145,9 @@ class Data_process():
     split_docs = []
     logger.info(f'Loading txt files from {data_path}')
     if os.path.isdir(data_path):
-        loader = DirectoryLoader(data_path, glob="**/*.txt",show_progress=True)
-        docs = loader.load()
-        split_docs = text_spliter.split_documents(docs)
+        loader = DirectoryLoader(data_path, glob="**/*.txt", show_progress=True)
+        docs = loader.load()
+        split_docs = text_spliter.split_documents(docs)
     elif data_path.endswith('.txt'):
         file_path = data_path
         logger.info(f'splitting file {file_path}')
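Patches 04 through 07 all touch the txt-ingestion path that feeds the FAISS index. Condensed, the build flow they exercise looks roughly like the sketch below; the directory names, chunk sizes, and the BGE embedding model are assumptions drawn from the READMEs, not the exact EmoLLM code:

```python
# Rough sketch of the vector-DB build in rag/src/data_processing.py.
# Paths, chunk sizes, and the embedding model are illustrative assumptions.
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter


def build_vector_db(data_dir="data/txt", vector_db_dir="data/vector_db"):
    # Load every txt file under data_dir and split it into chunks
    docs = DirectoryLoader(data_dir, glob="**/*.txt", show_progress=True).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = splitter.split_documents(docs)

    # Embed the chunks and persist the index (index.faiss + index.pkl)
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
    db = FAISS.from_documents(split_docs, embeddings)
    db.save_local(vector_db_dir)
    return db
```
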
 FlagEmbedding==1.2.8
 unstructured==0.12.6
+PyJWT
+faiss-gpu  # use faiss-cpu on devices without a GPU
 ```
 
 ```python
@@ -32,10 +34,18 @@ pip3 install -r requirements.txt
 
 ### Prepare the data
 
-- txt data: put it under the src.data.txt directory
-- json data: put it under the src.data.json directory
+#### Build your own Vector DB
+
+##### TXT data
+
+Convert the knowledge base you want to index into txt files and put them under the src.data.txt directory
+
+##### JSON data
+
+Build QA pairs and generate JSON files (multi-turn dialogue), then put them under the src.data.json directory
+
+The data format is as follows
 
-The JSON data format is as follows
 ```python
 [
     {
         "conversation": [
             {
                 "input": "...",
                 "output": "..."
             },
             ...
         ]
     },
     ...
 ]
 ```
 
-A vector DB is built from the prepared data; a folder named vector_db containing index.faiss and index.pkl ends up under the data folder
+A vector DB is built from the prepared data; a folder named vector_db containing index.faiss and index.pkl ends up under the data folder. If a vector DB already exists, it is loaded directly
 
-If a vector DB already exists, the corresponding database is loaded directly
+- You can download a prebuilt DB directly from xlab (run the code below inside the rag directory)
 
-
-**Note**: you can download a prebuilt DB directly from xlab (run the code below inside the rag directory)
 ```bash
 # https://openxlab.org.cn/models/detail/Anooyman/EmoLLMRAGTXT/tree/main
 git lfs install
 git clone https://code.openxlab.org.cn/Anooyman/EmoLLMRAGTXT.git
 ```
 
+- You can also download the dataset from the ModelScope community
+
+```bash
+# https://www.modelscope.cn/datasets/Anooyman/EmoLLMRAGTXT/summary
+git clone https://www.modelscope.cn/datasets/Anooyman/EmoLLMRAGTXT.git
+```
+
 
 ### Configure the config file
@@ -106,7 +121,50 @@ prompt_template = """
 """
 ```
 
-### Invocation
+### Local invocation
+
+*Note*
+Because the RAG code has been integrated into `web_internlm2.py`, the import paths no longer work for local invocation,
+so adjust the corresponding import paths as follows
+
+- src/data_processing.py
+```python
+#from rag.src.config.config import (
+#    embedding_path,
+#    embedding_model_name,
+#    doc_dir, qa_dir,
+#    knowledge_pkl_path,
+#    data_dir,
+#    vector_db_dir,
+#    rerank_path,
+#    rerank_model_name,
+#    chunk_size,
+#    chunk_overlap
+#)
+from config.config import (
+    embedding_path,
+    embedding_model_name,
+    doc_dir, qa_dir,
+    knowledge_pkl_path,
+    data_dir,
+    vector_db_dir,
+    rerank_path,
+    rerank_model_name,
+    chunk_size,
+    chunk_overlap
+)
+```
+
+- src/pipeline.py
+```python
+#from rag.src.data_processing import Data_process
+#from rag.src.config.config import prompt_template
+
+from data_processing import Data_process
+from config.config import prompt_template
+```
+
+After adjusting the import paths, run:
 
 ```python
 cd rag/src
 python main.py
 ```
 
 ## **Components**
 
+We provide two embedding/rerank combinations here, BGE and BCEmbedding; the better-performing BGE is the recommended one
+
+### [BGE Github](https://github.com/FlagOpen/FlagEmbedding)
+
+- [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5): embedding model, used to build the vector DB
+- [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large): rerank model, used to rerank retrieved passages
+
 ### [BCEmbedding](https://github.com/netease-youdao/BCEmbedding?tab=readme-ov-file)
 
 - [bce-embedding-base_v1](https://hf-mirror.com/maidalun1020/bce-embedding-base_v1): embedding model, used to build the vector DB
@@ -157,13 +222,13 @@ The classic RAG evaluation framework evaluates the following three aspects:
 
 ### The RAG pipeline in detail
 
-- Build vector DB based on dataset
-- Embedding the user input question
-- Retrieve in vector database based on embedding result
+- Build the vector DB from the dataset
+- Embed the user's input question
+- Retrieve from the vector database based on the embedding result
 - Rerank the recalled data
 - Generate the final answer from the user's question and the recalled data
 
-**Noted**: the above flow runs only when the user chooses to use RAG
+**Note**: the above flow runs only when the user chooses to use RAG
 
 ### Future enhancements
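The pipeline steps patch 08 documents (embed the question, retrieve, rerank, generate) condense into a short sketch built on the BGE models named above; the DB path and the k/top_n cut-offs are assumptions, not the exact EmoLLM implementation:

```python
# Sketch of the query-time flow from the README: embed the question,
# retrieve from FAISS, rerank with BGE. Path and cut-offs are assumptions.
from FlagEmbedding import FlagReranker
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
db = FAISS.load_local("data/vector_db", embeddings)
reranker = FlagReranker("BAAI/bge-reranker-large", use_fp16=True)


def retrieve(query, k=10, top_n=3):
    # Recall k candidates by embedding similarity, then keep the top_n
    # passages the reranker scores highest for (query, passage) pairs.
    docs = db.similarity_search(query, k=k)
    scores = reranker.compute_score([[query, d.page_content] for d in docs])
    ranked = sorted(zip(scores, docs), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked[:top_n]]
```
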
From 50c6f9bf3cbfa643d326de93da02a37b45452da7 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Fri, 3 May 2024 13:11:43 +0900
Subject: [PATCH 09/11] Update README_EN.md

---
 rag/README_EN.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/rag/README_EN.md b/rag/README_EN.md
index df4fe43..a3d3386 100644
--- a/rag/README_EN.md
+++ b/rag/README_EN.md
@@ -23,6 +23,13 @@ For details on data collection construction, please refer to [qa_generation_READ
 
 ## **Components**
 
+There are two embedding-and-rerank solutions, BGE and BCEmbedding; we recommend the more powerful **BGE**!
+
+### [BGE Github](https://github.com/FlagOpen/FlagEmbedding)
+
+- [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5): embedding model, used to build vector DB
+- [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large): rerank model, used to rerank retrieved documents
+
 ### [BCEmbedding](https://github.com/netease-youdao/BCEmbedding?tab=readme-ov-file)
 
 - [bce-embedding-base_v1](https://hf-mirror.com/maidalun1020/bce-embedding-base_v1): embedding model, used to build vector DB
@@ -63,4 +70,4 @@ Later, more evaluation indicators were added, such as: context recall, etc.
 
 - Add RAGAS evaluation results to the generation process. For example, when the generated results cannot solve the user's problem, it needs to be regenerated.
 - Add web retrieval to deal with the problem that the corresponding information cannot be retrieved in vector DB
-- Add multi-channel retrieval to increase recall rate. That is, multiple similar queries are generated based on user input for retrieval.
\ No newline at end of file
+- Add multi-channel retrieval to increase recall rate. That is, multiple similar queries are generated based on user input for retrieval.

From 63e32019f48f6d0057fa9db012331694fb295db4 Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Sat, 4 May 2024 12:03:34 +0900
Subject: [PATCH 10/11] Update data_processing.py: add the rag.src. import prefix back

---
 rag/src/data_processing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rag/src/data_processing.py b/rag/src/data_processing.py
index 1939d6f..968012e 100644
--- a/rag/src/data_processing.py
+++ b/rag/src/data_processing.py
@@ -4,7 +4,7 @@ import os
 from loguru import logger
 from langchain_community.vectorstores import FAISS
 
-from config.config import (
+from rag.src.config.config import (
     embedding_path,
     embedding_model_name,
     doc_dir, qa_dir,

From ff1cd02812498199c4587db16086ef7bd81be79c Mon Sep 17 00:00:00 2001
From: HongCheng
Date: Sat, 4 May 2024 12:04:13 +0900
Subject: [PATCH 11/11] Update pipeline.py: add the rag.src. import prefix back

---
 rag/src/pipeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rag/src/pipeline.py b/rag/src/pipeline.py
index 52e0012..08b9b96 100644
--- a/rag/src/pipeline.py
+++ b/rag/src/pipeline.py
@@ -2,8 +2,8 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import PromptTemplate
 from transformers.utils import logging
 
-from data_processing import Data_process
-from config.config import prompt_template
+from rag.src.data_processing import Data_process
+from rag.src.config.config import prompt_template
 
 logger = logging.get_logger(__name__)
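README_EN's planned multi-channel retrieval (several similar queries per user input) could be layered on top of the retriever along these lines; `generate_similar_queries` is a hypothetical helper (e.g., an LLM paraphrase call), not an existing function in the repo:

```python
# Editorial sketch of the multi-channel retrieval enhancement proposed in
# rag/README_EN.md. generate_similar_queries is a hypothetical callable
# that paraphrases the user query, e.g. via an LLM call.
def multi_channel_retrieve(db, generate_similar_queries, query, k=5):
    queries = [query] + generate_similar_queries(query, n=3)
    seen, merged = set(), []
    for q in queries:
        for doc in db.similarity_search(q, k=k):
            # Deduplicate on content so downstream reranking sees each
            # recalled passage only once.
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                merged.append(doc)
    return merged
```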