diff --git a/datasets/README.md b/datasets/README.md index eebb02b..f9390f4 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -24,6 +24,8 @@ | *General* | multi_turn_dataset_2 | Conversation | 27,000+ | | *General* | single_turn_dataset_1 | QA | 14,000+ | | *General* | single_turn_dataset_2 | QA | 18,300+ | +| *General* | self_cognition_EmoLLM | QA | 85+ | +| *General* | ruozhiba_raw | QA | 240+ | | *Role-play* | aiwei | Conversation | 4000+ | | *Role-play* | SoulStar | QA | 11,200+ | | *Role-play* | tiangou | Conversation | 3900+ | @@ -41,6 +43,8 @@ * 数据集 `multi_turn_dataset_2` 来源 [CPsyCounD](https://github.com/CAS-SIAT-XinHai/CPsyCoun) * 数据集 `single_turn_dataset_1` 来自本项目 * 数据集 `single_turn_dataset_2` 来自本项目 +* 数据集 `self_cognition_EmoLLM` 来自本项目 +* 数据集 `ruozhiba_raw` 来源[COIG-CQIA](https://huggingface.co/datasets/m-a-p/COIG-CQIA/viewer/ruozhiba) ### **Role-play** diff --git a/datasets/README_EN.md b/datasets/README_EN.md index d683561..7180468 100644 --- a/datasets/README_EN.md +++ b/datasets/README_EN.md @@ -22,6 +22,8 @@ | *General* | multi_turn_dataset_2 | Conversation | 27,000+ | | *General* | single_turn_dataset_1 | QA | 14,000+ | | *General* | single_turn_dataset_2 | QA | 18,300+ | +| *General* | self_cognition_EmoLLM | QA | 85+ | +| *General* | ruozhiba_raw | QA | 240+ | | *Role-play* | aiwei | Conversation | 4000+ | | *Role-play* | SoulStar | QA | 11,200+ | | *Role-play* | tiangou | Conversation | 3900+ | @@ -38,6 +40,8 @@ * dataset `multi_turn_dataset_2` from [CPsyCounD](https://github.com/CAS-SIAT-XinHai/CPsyCoun) * dataset `single_turn_dataset_1` from this repo * dataset `single_turn_dataset_2` from this repo +* dataset `self_cognition_EmoLLM` from this repo +* dataset `ruozhiba_raw` from [COIG-CQIA](https://huggingface.co/datasets/m-a-p/COIG-CQIA/viewer/ruozhiba) **Role-play**: * dataset `aiwei` from this repo diff --git a/rag/pdf2md/README.md b/rag/pdf2md/README.md index 6f669b3..b52bab5 100644 --- a/rag/pdf2md/README.md +++ b/rag/pdf2md/README.md @@ -4,90 +4,8 @@ 通过使用doc2x的库,实现将pdf文件转换为结构化md文档。 -通过代码调用(需要提供api_key): +通过代码调用(需要提供api_key),详见代码`pdf2md.py` -~~~python -import requests as rq -import json -import os -import zipfile - -class PDF2MD: - def __init__(self, api_key): - self.api_key = api_key - self.url = "https://api.doc2x.noedgeai.com/api/v1/pdf" - self.export_url = "https://api.doc2x.noedgeai.com/api/export" - - def convert(self, filepath, to="md"): - filename = os.path.splitext(os.path.basename(filepath))[0] - - res = rq.post(self.url, files={"file": open(filepath, "rb")}, headers={"Authorization": "Bearer " + self.api_key}, stream=True) - - if res.status_code == 200: - txt_path = filename + ".txt" - with open(txt_path, "w", encoding="utf-8") as f: - for line in res.iter_lines(): - if len(line) > 0: - decoded_line = line.decode("utf-8") - f.write(decoded_line + "\n") - print(decoded_line) - - uuid = json.loads(decoded_line.replace("data: ", ''))['uuid'] - print(uuid) - - if to == "md" or to == 'latex': - path = filename + '.zip' - elif to == 'docx': - path = filename + '.docx' - - export_url = self.export_url + "?request_id=" + uuid + "&to=" + to - res = rq.get(export_url, headers={"Authorization": "Bearer " + self.api_key}) - - if res.status_code == 200: - with open(path, "wb") as f: - f.write(res.content) - print("下载成功,存入:", path) - - if to == "md" or to == 'latex': - zip_file = zipfile.ZipFile(path) - - # 创建以原始文件名命名的文件夹 - if not os.path.exists(filename): - os.mkdir(filename) - - # 解压到该文件夹内 - for names in zip_file.namelist(): - zip_file.extract(names, filename) - zip_file.close() - - # 找到解压后的md文件 - for file in os.listdir(filename): - if file.endswith(".md"): - extracted_md = os.path.join(filename, file) - break - - # 重命名md文件 - new_md_name = os.path.join(filename, filename+'.md') - os.rename(extracted_md, new_md_name) - print("解压并重命名md文件为:", new_md_name) - - else: - print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) - else: - print(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) - - - -def main(): - api_key = "sk-xxx" - filepath = r"test.pdf" - converter = PDF2MD(api_key) - converter.convert(filepath, to="md") - - -if __name__ == "__main__": - main() -~~~ ## 通过网页使用在线PDF2MD服务: