From 975783ecfb740be49c538a6093229575f87c92cc Mon Sep 17 00:00:00 2001 From: wql <626394316@qq.com> Date: Sat, 27 Apr 2024 22:35:32 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E4=BA=86pdf2md=E7=9A=84?= =?UTF-8?q?=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rag/pdf2md/README.md | 92 ++++++++++++++++++++++++++++++++++++++++++++ rag/pdf2md/pdf2md.py | 81 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+) create mode 100644 rag/pdf2md/README.md create mode 100644 rag/pdf2md/pdf2md.py diff --git a/rag/pdf2md/README.md b/rag/pdf2md/README.md new file mode 100644 index 0000000..3cd74c3 --- /dev/null +++ b/rag/pdf2md/README.md @@ -0,0 +1,92 @@ +## RAG pdf2md + +通过使用doc2x的库,实现将pdf文件转换为结构化md文档。 + +通过代码调用(需要提供api_key): + +~~~ +import requests as rq +import json +import os +import zipfile + +class PDF2MD: + def __init__(self, api_key): + self.api_key = api_key + self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf" + self.export_url = "https://api.doc2x.noedgeai.com/api/export" + + def convert(self, filepath, to="md"): + filename = os.path.splitext(os.path.basename(filepath))[0] + + res = rq.post(self.url, files={"file": open(filepath, "rb")}, headers={"Authorization": "Bearer " + self.api_key}, stream=True) + + if res.status_code == 200: + txt_path = filename + ".txt" + with open(txt_path, "w", encoding="utf-8") as f: + for line in res.iter_lines(): + if len(line) > 0: + decoded_line = line.decode("utf-8") + f.write(decoded_line + "\n") + print(decoded_line) + + uuid = json.loads(decoded_line.replace("data: ", ''))['uuid'] + print(uuid) + + if to == "md" or to == 'latex': + path = filename + '.zip' + elif to == 'docx': + path = filename + '.docx' + + export_url = self.export_url + "?request_id=" + uuid + "&to=" + to + res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key}) + + if res.status_code == 200: + with open(path, "wb") as f: + f.write(res.content) + print("下载成功,存入:", path) + + if to == "md" or to == 'latex': + zip_file = zipfile.ZipFile(path) + + # 创建以原始文件名命名的文件夹 + if not os.path.exists(filename): + os.mkdir(filename) + + # 解压到该文件夹内 + for names in zip_file.namelist(): + zip_file.extract(names, filename) + zip_file.close() + + # 找到解压后的md文件 + for file in os.listdir(filename): + if file.endswith(".md"): + extracted_md = os.path.join(filename, file) + break + + # 重命名md文件 + new_md_name = os.path.join(filename, filename+'.md') + os.rename(extracted_md, new_md_name) + print("解压并重命名md文件为:", new_md_name) + + else: + print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + else: + print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + + + +def main(): + api_key = "sk-xxx" + filepath = r"test.pdf" + converter = PDF2MD(api_key) + converter.convert(filepath, to="md") + + +if __name__ == "__main__": + main() +~~~ + +通过网页直接在线转: + +在线网页地址:https://doc2x.noedgeai.com \ No newline at end of file diff --git a/rag/pdf2md/pdf2md.py b/rag/pdf2md/pdf2md.py new file mode 100644 index 0000000..0e39a39 --- /dev/null +++ b/rag/pdf2md/pdf2md.py @@ -0,0 +1,81 @@ +import requests as rq +import json +import os +import zipfile + + +class PDF2MD: + def __init__(self, api_key): + self.api_key = api_key + self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf" + self.export_url = "https://api.doc2x.noedgeai.com/api/export" + + def convert(self, filepath, to="md"): + filename = os.path.splitext(os.path.basename(filepath))[0] + + res = rq.post(self.url, files={"file": open(filepath, "rb")}, + headers={"Authorization": "Bearer " + self.api_key}, stream=True) + + if res.status_code == 200: + txt_path = filename + ".txt" + with open(txt_path, "w", encoding="utf-8") as f: + for line in res.iter_lines(): + if len(line) > 0: + decoded_line = line.decode("utf-8") + f.write(decoded_line + "\n") + print(decoded_line) + + uuid = json.loads(decoded_line.replace("data: ", ''))['uuid'] + print(uuid) + + if to == "md" or to == 'latex': + path = filename + '.zip' + elif to == 'docx': + path = filename + '.docx' + + export_url = self.export_url + "?request_id=" + uuid + "&to=" + to + res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key}) + + if res.status_code == 200: + with open(path, "wb") as f: + f.write(res.content) + print("下载成功,存入:", path) + + if to == "md" or to == 'latex': + zip_file = zipfile.ZipFile(path) + + # 创建以原始文件名命名的文件夹 + if not os.path.exists(filename): + os.mkdir(filename) + + # 解压到该文件夹内 + for names in zip_file.namelist(): + zip_file.extract(names, filename) + zip_file.close() + + # 找到解压后的md文件 + for file in os.listdir(filename): + if file.endswith(".md"): + extracted_md = os.path.join(filename, file) + break + + # 重命名md文件 + new_md_name = os.path.join(filename, filename + '.md') + os.rename(extracted_md, new_md_name) + print("解压并重命名md文件为:", new_md_name) + + else: + print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + else: + print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + + +def main(): + api_key = "sk-xxx" + filepath = r"test.pdf" + converter = PDF2MD(api_key) + converter.convert(filepath, to="md") + + +if __name__ == "__main__": + main() \ No newline at end of file