Update README.md

This commit is contained in:
MING_X 2024-05-04 18:08:54 +08:00
parent 0aa6079770
commit dac4b20192
3 changed files with 9 additions and 83 deletions

View File

@ -24,6 +24,8 @@
| *General* | multi_turn_dataset_2 | Conversation | 27,000+ |
| *General* | single_turn_dataset_1 | QA | 14,000+ |
| *General* | single_turn_dataset_2 | QA | 18,300+ |
| *General* | self_cognition_EmoLLM | QA | 85+ |
| *General* | ruozhiba_raw | QA | 240+ |
| *Role-play* | aiwei | Conversation | 4,000+ |
| *Role-play* | SoulStar | QA | 11,200+ |
| *Role-play* | tiangou | Conversation | 3,900+ |
@ -41,6 +43,8 @@
* 数据集 `multi_turn_dataset_2` 来源 [CPsyCounD](https://github.com/CAS-SIAT-XinHai/CPsyCoun)
* 数据集 `single_turn_dataset_1` 来自本项目
* 数据集 `single_turn_dataset_2` 来自本项目
* 数据集 `self_cognition_EmoLLM` 来自本项目
* 数据集 `ruozhiba_raw` 来源 [COIG-CQIA](https://huggingface.co/datasets/m-a-p/COIG-CQIA/viewer/ruozhiba)
### **Role-play**

View File

@ -22,6 +22,8 @@
| *General* | multi_turn_dataset_2 | Conversation | 27,000+ |
| *General* | single_turn_dataset_1 | QA | 14,000+ |
| *General* | single_turn_dataset_2 | QA | 18,300+ |
| *General* | self_cognition_EmoLLM | QA | 85+ |
| *General* | ruozhiba_raw | QA | 240+ |
| *Role-play* | aiwei | Conversation | 4,000+ |
| *Role-play* | SoulStar | QA | 11,200+ |
| *Role-play* | tiangou | Conversation | 3,900+ |
@ -38,6 +40,8 @@
* dataset `multi_turn_dataset_2` from [CPsyCounD](https://github.com/CAS-SIAT-XinHai/CPsyCoun)
* dataset `single_turn_dataset_1` from this repo
* dataset `single_turn_dataset_2` from this repo
* dataset `self_cognition_EmoLLM` from this repo
* dataset `ruozhiba_raw` from [COIG-CQIA](https://huggingface.co/datasets/m-a-p/COIG-CQIA/viewer/ruozhiba)
**Role-play**
* dataset `aiwei` from this repo

View File

@ -4,90 +4,8 @@
使用 doc2x 库将 PDF 文件转换为结构化的 Markdown 文档。
通过代码调用(需要提供api_key)
通过代码调用(需要提供api_key),详见代码`pdf2md.py`
~~~python
import requests as rq
import json
import os
import zipfile
class PDF2MD:
    """Client for the doc2x HTTP API that converts a PDF to md, latex or docx.

    The PDF is uploaded to ``self.url``; the exported result is then fetched
    from ``self.export_url`` using the ``uuid`` reported by the upload
    response.
    """

    # Supported export formats mapped to the extension of the downloaded file.
    # "md" and "latex" exports are written as zip archives.
    _EXPORT_SUFFIX = {"md": ".zip", "latex": ".zip", "docx": ".docx"}

    def __init__(self, api_key):
        """Store the API key and the upload/export endpoint URLs.

        Args:
            api_key: Key sent as a ``Bearer`` token in the Authorization header.
        """
        self.api_key = api_key
        self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
        self.export_url = "https://api.doc2x.noedgeai.com/api/export"

    def convert(self, filepath, to="md"):
        """Upload ``filepath`` and download the converted result.

        Files are written to the current working directory, named after the
        PDF's base name: ``<name>.txt`` (raw upload response lines),
        ``<name>.zip`` or ``<name>.docx`` (the export), and for zip exports a
        ``<name>/`` directory holding the extracted archive.

        Args:
            filepath: Path of the PDF file to upload.
            to: Export format; one of ``"md"``, ``"latex"`` or ``"docx"``.

        Raises:
            ValueError: If ``to`` is not a supported format, or if the upload
                response line is not valid JSON.
            KeyError: If the upload response carries no ``uuid`` field.
        """
        # Reject unknown formats before uploading; previously this only
        # failed after the upload, with an unbound-variable error.
        if to not in self._EXPORT_SUFFIX:
            raise ValueError(
                "unsupported export format %r, expected one of %s"
                % (to, sorted(self._EXPORT_SUFFIX))
            )

        filename = os.path.splitext(os.path.basename(filepath))[0]
        headers = {"Authorization": "Bearer " + self.api_key}

        # Close the PDF handle as soon as the upload request has returned.
        with open(filepath, "rb") as pdf_file:
            res = rq.post(self.url, files={"file": pdf_file}, headers=headers, stream=True)
        if res.status_code != 200:
            print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
            return

        # Log every non-empty response line and remember the last one.
        decoded_line = None
        with open(filename + ".txt", "w", encoding="utf-8") as f:
            for line in res.iter_lines():
                if len(line) > 0:
                    decoded_line = line.decode("utf-8")
                    f.write(decoded_line + "\n")
                    print(decoded_line)
        if decoded_line is None:
            print("[ERROR] empty response from upload endpoint")
            return

        # The request id is parsed from the last non-empty line, after
        # stripping the "data: " marker.
        uuid = json.loads(decoded_line.replace("data: ", ''))['uuid']
        print(uuid)

        path = filename + self._EXPORT_SUFFIX[to]
        export_url = self.export_url + "?request_id=" + uuid + "&to=" + to
        res = rq.get(export_url, headers=headers)
        if res.status_code != 200:
            print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
            return

        with open(path, "wb") as f:
            f.write(res.content)
        print("下载成功,存入:", path)

        if to == "md" or to == 'latex':
            self._unpack_archive(path, filename)

    def _unpack_archive(self, archive_path, target_dir):
        """Extract the zip into ``target_dir`` and rename its md file, if any, to ``<target_dir>.md``."""
        os.makedirs(target_dir, exist_ok=True)
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(target_dir)

        # Pick the first extracted .md file; an archive may contain none.
        extracted_md = next(
            (
                os.path.join(target_dir, name)
                for name in os.listdir(target_dir)
                if name.endswith(".md")
            ),
            None,
        )
        if extracted_md is None:
            print("解压完成,存入:", target_dir)
            return

        new_md_name = os.path.join(target_dir, target_dir + '.md')
        os.rename(extracted_md, new_md_name)
        print("解压并重命名md文件为:", new_md_name)
def main():
    """Convert ``test.pdf`` to Markdown with a ``PDF2MD`` client."""
    # "sk-xxx" is presumably a placeholder; substitute a real doc2x API key.
    client = PDF2MD("sk-xxx")
    client.convert(r"test.pdf", to="md")


if __name__ == "__main__":
    main()
~~~
## 通过网页使用在线PDF2MD服务