上传了pdf2md的脚本
This commit is contained in:
parent
4e05afb8c1
commit
975783ecfb
92
rag/pdf2md/README.md
Normal file
92
rag/pdf2md/README.md
Normal file
@ -0,0 +1,92 @@
|
||||
## RAG pdf2md
|
||||
|
||||
通过使用doc2x的库,实现将pdf文件转换为结构化md文档。
|
||||
|
||||
通过代码调用(需要提供api_key):
|
||||
|
||||
~~~
|
||||
import requests as rq
|
||||
import json
|
||||
import os
|
||||
import zipfile
|
||||
|
||||
class PDF2MD:
|
||||
def __init__(self, api_key):
|
||||
self.api_key = api_key
|
||||
self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
|
||||
self.export_url = "https://api.doc2x.noedgeai.com/api/export"
|
||||
|
||||
def convert(self, filepath, to="md"):
|
||||
filename = os.path.splitext(os.path.basename(filepath))[0]
|
||||
|
||||
res = rq.post(self.url, files={"file": open(filepath, "rb")}, headers={"Authorization": "Bearer " + self.api_key}, stream=True)
|
||||
|
||||
if res.status_code == 200:
|
||||
txt_path = filename + ".txt"
|
||||
with open(txt_path, "w", encoding="utf-8") as f:
|
||||
for line in res.iter_lines():
|
||||
if len(line) > 0:
|
||||
decoded_line = line.decode("utf-8")
|
||||
f.write(decoded_line + "\n")
|
||||
print(decoded_line)
|
||||
|
||||
uuid = json.loads(decoded_line.replace("data: ", ''))['uuid']
|
||||
print(uuid)
|
||||
|
||||
if to == "md" or to == 'latex':
|
||||
path = filename + '.zip'
|
||||
elif to == 'docx':
|
||||
path = filename + '.docx'
|
||||
|
||||
export_url = self.export_url + "?request_id=" + uuid + "&to=" + to
|
||||
res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key})
|
||||
|
||||
if res.status_code == 200:
|
||||
with open(path, "wb") as f:
|
||||
f.write(res.content)
|
||||
print("下载成功,存入:", path)
|
||||
|
||||
if to == "md" or to == 'latex':
|
||||
zip_file = zipfile.ZipFile(path)
|
||||
|
||||
# 创建以原始文件名命名的文件夹
|
||||
if not os.path.exists(filename):
|
||||
os.mkdir(filename)
|
||||
|
||||
# 解压到该文件夹内
|
||||
for names in zip_file.namelist():
|
||||
zip_file.extract(names, filename)
|
||||
zip_file.close()
|
||||
|
||||
# 找到解压后的md文件
|
||||
for file in os.listdir(filename):
|
||||
if file.endswith(".md"):
|
||||
extracted_md = os.path.join(filename, file)
|
||||
break
|
||||
|
||||
# 重命名md文件
|
||||
new_md_name = os.path.join(filename, filename+'.md')
|
||||
os.rename(extracted_md, new_md_name)
|
||||
print("解压并重命名md文件为:", new_md_name)
|
||||
|
||||
else:
|
||||
print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
||||
else:
|
||||
print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
api_key = "sk-xxx"
|
||||
filepath = r"test.pdf"
|
||||
converter = PDF2MD(api_key)
|
||||
converter.convert(filepath, to="md")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
~~~
|
||||
|
||||
通过网页直接在线转:
|
||||
|
||||
在线网页地址:https://doc2x.noedgeai.com
|
81
rag/pdf2md/pdf2md.py
Normal file
81
rag/pdf2md/pdf2md.py
Normal file
@ -0,0 +1,81 @@
|
||||
import requests as rq
|
||||
import json
|
||||
import os
|
||||
import zipfile
|
||||
|
||||
|
||||
class PDF2MD:
|
||||
def __init__(self, api_key):
|
||||
self.api_key = api_key
|
||||
self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
|
||||
self.export_url = "https://api.doc2x.noedgeai.com/api/export"
|
||||
|
||||
def convert(self, filepath, to="md"):
|
||||
filename = os.path.splitext(os.path.basename(filepath))[0]
|
||||
|
||||
res = rq.post(self.url, files={"file": open(filepath, "rb")},
|
||||
headers={"Authorization": "Bearer " + self.api_key}, stream=True)
|
||||
|
||||
if res.status_code == 200:
|
||||
txt_path = filename + ".txt"
|
||||
with open(txt_path, "w", encoding="utf-8") as f:
|
||||
for line in res.iter_lines():
|
||||
if len(line) > 0:
|
||||
decoded_line = line.decode("utf-8")
|
||||
f.write(decoded_line + "\n")
|
||||
print(decoded_line)
|
||||
|
||||
uuid = json.loads(decoded_line.replace("data: ", ''))['uuid']
|
||||
print(uuid)
|
||||
|
||||
if to == "md" or to == 'latex':
|
||||
path = filename + '.zip'
|
||||
elif to == 'docx':
|
||||
path = filename + '.docx'
|
||||
|
||||
export_url = self.export_url + "?request_id=" + uuid + "&to=" + to
|
||||
res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key})
|
||||
|
||||
if res.status_code == 200:
|
||||
with open(path, "wb") as f:
|
||||
f.write(res.content)
|
||||
print("下载成功,存入:", path)
|
||||
|
||||
if to == "md" or to == 'latex':
|
||||
zip_file = zipfile.ZipFile(path)
|
||||
|
||||
# 创建以原始文件名命名的文件夹
|
||||
if not os.path.exists(filename):
|
||||
os.mkdir(filename)
|
||||
|
||||
# 解压到该文件夹内
|
||||
for names in zip_file.namelist():
|
||||
zip_file.extract(names, filename)
|
||||
zip_file.close()
|
||||
|
||||
# 找到解压后的md文件
|
||||
for file in os.listdir(filename):
|
||||
if file.endswith(".md"):
|
||||
extracted_md = os.path.join(filename, file)
|
||||
break
|
||||
|
||||
# 重命名md文件
|
||||
new_md_name = os.path.join(filename, filename + '.md')
|
||||
os.rename(extracted_md, new_md_name)
|
||||
print("解压并重命名md文件为:", new_md_name)
|
||||
|
||||
else:
|
||||
print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
||||
else:
|
||||
print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
||||
|
||||
|
||||
def main():
|
||||
api_key = "sk-xxx"
|
||||
filepath = r"test.pdf"
|
||||
converter = PDF2MD(api_key)
|
||||
converter.convert(filepath, to="md")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Reference in New Issue
Block a user