OliveSensorAPI/rag/pdf2md/README.md
2024-04-29 11:28:50 +09:00

95 lines
3.2 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# PDF2MD for RAG
## 使用api_key使用PDF2MD
通过使用doc2x的库实现将pdf文件转换为结构化md文档。
通过代码调用(需要提供api_key)
~~~python
import requests as rq
import json
import os
import zipfile
class PDF2MD:
def __init__(self, api_key):
self.api_key = api_key
self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
self.export_url = "https://api.doc2x.noedgeai.com/api/export"
def convert(self, filepath, to="md"):
filename = os.path.splitext(os.path.basename(filepath))[0]
res = rq.post(self.url, files={"file": open(filepath, "rb")}, headers={"Authorization": "Bearer " + self.api_key}, stream=True)
if res.status_code == 200:
txt_path = filename + ".txt"
with open(txt_path, "w", encoding="utf-8") as f:
for line in res.iter_lines():
if len(line) > 0:
decoded_line = line.decode("utf-8")
f.write(decoded_line + "\n")
print(decoded_line)
uuid = json.loads(decoded_line.replace("data: ", ''))['uuid']
print(uuid)
if to == "md" or to == 'latex':
path = filename + '.zip'
elif to == 'docx':
path = filename + '.docx'
export_url = self.export_url + "?request_id=" + uuid + "&to=" + to
res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key})
if res.status_code == 200:
with open(path, "wb") as f:
f.write(res.content)
print("下载成功,存入:", path)
if to == "md" or to == 'latex':
zip_file = zipfile.ZipFile(path)
# 创建以原始文件名命名的文件夹
if not os.path.exists(filename):
os.mkdir(filename)
# 解压到该文件夹内
for names in zip_file.namelist():
zip_file.extract(names, filename)
zip_file.close()
# 找到解压后的md文件
for file in os.listdir(filename):
if file.endswith(".md"):
extracted_md = os.path.join(filename, file)
break
# 重命名md文件
new_md_name = os.path.join(filename, filename+'.md')
os.rename(extracted_md, new_md_name)
print("解压并重命名md文件为:", new_md_name)
else:
print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
else:
print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
def main():
api_key = "sk-xxx"
filepath = r"test.pdf"
converter = PDF2MD(api_key)
converter.convert(filepath, to="md")
if __name__ == "__main__":
main()
~~~
## 通过网页使用在线PDF2MD服务
doc2x在线服务地址https://doc2x.noedgeai.com