OliveSensorAPI/rag/pdf2md
2024-04-29 11:28:50 +09:00
..
pdf2md.py 上传了pdf2md的脚本 2024-04-27 22:35:32 +08:00
README.md Update README.md (title and format) 2024-04-29 11:28:50 +09:00

PDF2MD for RAG

通过api_key使用PDF2MD

通过调用doc2x的API,将PDF文件转换为结构化的Markdown(md)文档。

通过代码调用(需要提供api_key)

import requests as rq
import json
import os
import zipfile

class PDF2MD:
    """Convert a PDF file through the doc2x HTTP API and download the export.

    The PDF is uploaded to ``self.url``; the streamed response lines are
    logged to ``<name>.txt`` and the last one supplies the request ``uuid``.
    That uuid is then used to fetch the export from ``self.export_url``.
    """

    def __init__(self, api_key):
        """Store the API key and the upload/export endpoint URLs.

        Args:
            api_key: Token sent as ``Authorization: Bearer <api_key>``.
        """
        self.api_key = api_key
        self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
        self.export_url = "https://api.doc2x.noedgeai.com/api/export"

    @staticmethod
    def _report_http_error(res):
        """Print the status code and body of a failed HTTP response."""
        print("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))

    def convert(self, filepath, to="md"):
        """Upload ``filepath`` and save the converted result in the working directory.

        For ``to="md"`` or ``to="latex"`` the export is saved as ``<name>.zip``
        and unpacked into a folder ``<name>/``; a Markdown file found there is
        renamed to ``<name>/<name>.md``. For ``to="docx"`` the export is saved
        as ``<name>.docx``. HTTP failures are printed, not raised.

        Args:
            filepath: Path of the PDF file to upload.
            to: Export format, one of ``"md"``, ``"latex"`` or ``"docx"``.

        Raises:
            ValueError: If ``to`` is not a supported export format.
        """
        filename = os.path.splitext(os.path.basename(filepath))[0]

        # Validate the target format before uploading, so an unsupported
        # value fails immediately instead of after the conversion has run.
        if to in ("md", "latex"):
            path = filename + ".zip"
        elif to == "docx":
            path = filename + ".docx"
        else:
            raise ValueError(
                "unsupported export format %r; expected 'md', 'latex' or 'docx'" % (to,)
            )

        headers = {"Authorization": "Bearer " + self.api_key}

        # The context manager guarantees the PDF handle is closed once the
        # upload request has been sent.
        with open(filepath, "rb") as pdf_file:
            res = rq.post(self.url, files={"file": pdf_file}, headers=headers, stream=True)

        if res.status_code != 200:
            self._report_http_error(res)
            return

        # Log every non-empty streamed line and remember the last one, which
        # carries the request uuid.
        last_line = None
        txt_path = filename + ".txt"
        with open(txt_path, "w", encoding="utf-8") as f:
            for line in res.iter_lines():
                if line:
                    last_line = line.decode("utf-8")
                    f.write(last_line + "\n")
                    print(last_line)

        if last_line is None:
            print("[ERROR] empty response from conversion endpoint")
            return

        # Strip only the leading "data: " marker; the JSON payload itself is
        # left untouched.
        prefix = "data: "
        payload = last_line[len(prefix):] if last_line.startswith(prefix) else last_line
        uuid = json.loads(payload)['uuid']
        print(uuid)

        res = rq.get(self.export_url, params={"request_id": uuid, "to": to}, headers=headers)
        if res.status_code != 200:
            self._report_http_error(res)
            return

        with open(path, "wb") as f:
            f.write(res.content)
        print("下载成功,存入:", path)

        if to == "docx":
            return

        # Unpack the archive into a folder named after the source file.
        os.makedirs(filename, exist_ok=True)
        with zipfile.ZipFile(path) as zip_file:
            zip_file.extractall(filename)

        target_name = filename + ".md"
        md_files = [name for name in os.listdir(filename) if name.endswith(".md")]
        if not md_files:
            # Nothing to rename, e.g. when the archive holds no Markdown file.
            print("[WARN] no .md file found in:", filename)
            return

        # Prefer a freshly extracted file over a same-named leftover from a
        # previous run.
        source_name = next((name for name in md_files if name != target_name), target_name)
        new_md_name = os.path.join(filename, target_name)
        if source_name != target_name:
            # os.replace overwrites an existing target on every platform.
            os.replace(os.path.join(filename, source_name), new_md_name)
        print("解压并重命名md文件为:", new_md_name)



def main():
    """Run a sample conversion of ``test.pdf`` to Markdown with a placeholder key."""
    PDF2MD("sk-xxx").convert(r"test.pdf", to="md")


# Run the sample conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()

通过网页使用在线PDF2MD服务

doc2x在线服务地址:https://doc2x.noedgeai.com