Update README.md

2024-05-04 18:08:54 +08:00 · 2024-05-04 18:08:54 +08:00 · dac4b20192
commit dac4b20192
parent 0aa6079770
3 changed files with 9 additions and 83 deletions
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -24,6 +24,8 @@
 |  *General*  | multi_turn_dataset_2  | Conversation | 27,000+ |
 |  *General*  | single_turn_dataset_1 |      QA      | 14,000+ |
 |  *General*  | single_turn_dataset_2 |      QA      | 18,300+ |
+|  *General*  | self_cognition_EmoLLM |      QA      |   85+   |
+|  *General*  |      ruozhiba_raw     |      QA      |   240+  |
 | *Role-play* |         aiwei         | Conversation |  4000+  |
 | *Role-play* |       SoulStar        |      QA      | 11,200+ |
 | *Role-play* |        tiangou        | Conversation |  3900+  |
@@ -41,6 +43,8 @@
 * 数据集 `multi_turn_dataset_2` 来源 [CPsyCounD](https://github.com/CAS-SIAT-XinHai/CPsyCoun)
 * 数据集 `single_turn_dataset_1` 来自本项目
 * 数据集 `single_turn_dataset_2` 来自本项目
+* 数据集 `self_cognition_EmoLLM` 来自本项目
+* 数据集 `ruozhiba_raw` 来源 [COIG-CQIA](https://huggingface.co/datasets/m-a-p/COIG-CQIA/viewer/ruozhiba)

 ### **Role-play**

--- a/datasets/README_EN.md
+++ b/datasets/README_EN.md
@@ -22,6 +22,8 @@
 |  *General*  | multi_turn_dataset_2  | Conversation | 27,000+ |
 |  *General*  | single_turn_dataset_1 |      QA      | 14,000+ |
 |  *General*  | single_turn_dataset_2 |      QA      | 18,300+ |
+|  *General*  | self_cognition_EmoLLM |      QA      |   85+   |
+|  *General*  |      ruozhiba_raw     |      QA      |   240+  |
 | *Role-play* |         aiwei         | Conversation |  4000+  |
 | *Role-play* |       SoulStar        |      QA      | 11,200+ |
 | *Role-play* |        tiangou        | Conversation |  3900+  |
@@ -38,6 +40,8 @@
 * dataset `multi_turn_dataset_2` from [CPsyCounD](https://github.com/CAS-SIAT-XinHai/CPsyCoun)
 * dataset `single_turn_dataset_1` from this repo
 * dataset `single_turn_dataset_2` from this repo
+* dataset `self_cognition_EmoLLM` from this repo
+* dataset `ruozhiba_raw` from [COIG-CQIA](https://huggingface.co/datasets/m-a-p/COIG-CQIA/viewer/ruozhiba)

 **Role-play**：
 * dataset `aiwei` from this repo
--- a/rag/pdf2md/README.md
+++ b/rag/pdf2md/README.md
@@ -4,90 +4,8 @@

 通过使用doc2x的库，实现将pdf文件转换为结构化md文档。

-通过代码调用(需要提供api_key)：
+通过代码调用(需要提供api_key)，详见代码 `pdf2md.py`。

-~~~python
-import requests as rq
-import json
-import os
-import zipfile
-
-class PDF2MD:
-    def __init__(self, api_key):
-        self.api_key = api_key
-        self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
-        self.export_url = "https://api.doc2x.noedgeai.com/api/export"
-
-    def convert(self, filepath, to="md"):
-        filename = os.path.splitext(os.path.basename(filepath))[0]
-        
-        res = rq.post(self.url, files={"file": open(filepath, "rb")}, headers={"Authorization": "Bearer " + self.api_key}, stream=True)
-
-        if res.status_code == 200:
-            txt_path = filename + ".txt"
-            with open(txt_path, "w", encoding="utf-8") as f:
-                for line in res.iter_lines():
-                    if len(line) > 0:
-                        decoded_line = line.decode("utf-8")
-                        f.write(decoded_line + "\n")
-                        print(decoded_line)
-            
-            uuid = json.loads(decoded_line.replace("data: ", ''))['uuid']
-            print(uuid)
-            
-            if to == "md" or to == 'latex':
-                path = filename + '.zip'
-            elif to == 'docx':
-                path = filename + '.docx'
-            
-            export_url = self.export_url + "?request_id=" + uuid + "&to=" + to
-            res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key})
-            
-            if res.status_code == 200:
-                with open(path, "wb") as f:
-                    f.write(res.content)
-                print("下载成功,存入:", path)
-                
-                if to == "md" or to == 'latex':
-                    zip_file = zipfile.ZipFile(path)
-                    
-                    # 创建以原始文件名命名的文件夹
-                    if not os.path.exists(filename):
-                        os.mkdir(filename)
-                    
-                    # 解压到该文件夹内
-                    for names in zip_file.namelist():
-                        zip_file.extract(names, filename)
-                    zip_file.close()
-                    
-                    # 找到解压后的md文件
-                    for file in os.listdir(filename):
-                        if file.endswith(".md"):
-                            extracted_md = os.path.join(filename, file)
-                            break
-                    
-                    # 重命名md文件
-                    new_md_name = os.path.join(filename, filename+'.md')
-                    os.rename(extracted_md, new_md_name)
-                    print("解压并重命名md文件为:", new_md_name)
-                
-            else:
-                print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
-        else:
-            print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
-
-
-
-def main():
-    api_key = "sk-xxx"
-    filepath = r"test.pdf"
-    converter = PDF2MD(api_key)
-    converter.convert(filepath, to="md")
-
-
-if __name__ == "__main__":
-    main()
-~~~

 ## 通过网页使用在线PDF2MD服务：