Add files via upload

2024-03-09 21:30:52 +08:00 · 2024-03-09 21:30:52 +08:00 · 1912b745a6
commit 1912b745a6
parent 7891e1aa5f
1 changed files with 62 additions and 0 deletions
--- a/rag/src/util/text_seg.py
+++ b/rag/src/util/text_seg.py
@ -0,0 +1,62 @@
 # 对文本类的（非QA对）数据做切分 --- 使用qwen的api对书籍进行语义分割
 import json
 import random
 import argparse
 import yaml
 import re
 import copy
 from tqdm import tqdm
 # config.yml is supplied by the user and is read from the current working
 # directory. This script looks up the keys 'dashscope_api_key' and 'txt_path'.
 with open('config.yml', mode='r', encoding='utf-8') as f:
    # PyYAML accepts an open text stream directly, so no explicit read() is needed.
    configs = yaml.load(f, Loader=yaml.FullLoader)
 def qwen_api(content):
    """Ask DashScope's ``qwen-max`` model to split *content* into segments.

    The text is embedded in a fixed Chinese instruction prompt that asks the
    model to divide it by topic, one segment per line, without shortening it.
    The API key is read from ``configs['dashscope_api_key']``.

    Args:
        content: The raw text chunk to be segmented.

    Returns:
        The model's reply text when the request succeeds (it is also echoed
        to stdout), or the literal string ``'ERROR'`` when the response status
        is not HTTP 200. On failure the status code, error code and error
        message of the response are reported on stderr so the cause is not
        lost.
    """
    import sys
    from http import HTTPStatus

    import dashscope

    # The template, including its trailing newline and indentation, is sent to
    # the model verbatim, so it is kept exactly as originally written.
    prompt = '''我们的分割要求是每一个划分占一行，请你帮我将下列txt文本按照书本的内容（比如：事件的背景，心理学名词的定义，特点，阶段划分，实验内容等）进行划分，要求文本内容不能缩减，也可以按照语义分割，比如某几句话都是讲的一回事就划分一行，要求划分之后的文本内容详细，主题明确，要求每一个划分仅用一行表示。以下为要求分割的txt文本：{}
        '''.format(content)
    dashscope.api_key = configs['dashscope_api_key']
    response = dashscope.Generation.call(
        model='qwen-max',
        prompt=prompt,
        history=[],
    )
    if response.status_code != HTTPStatus.OK:
        # Keep the 'ERROR' sentinel for callers, but surface why the call failed.
        print(
            'qwen_api request failed: status_code={}, code={}, message={}'.format(
                response.status_code, response.code, response.message),
            file=sys.stderr,
        )
        return 'ERROR'
    result = response.output.text
    print(result)
    return result
 def save_jsonl(data_lis, file_path):
    """Append every item of *data_lis* to *file_path* in JSON Lines format.

    Each item is serialised with ``ensure_ascii=False`` (non-ASCII characters
    are written as-is, UTF-8 encoded) and followed by a newline. The file is
    opened in append mode, so existing content is preserved.

    Args:
        data_lis: Iterable of JSON-serialisable objects, one per output line.
        file_path: Path of the JSONL file to append to.
    """
    # Lazy generator: records are serialised one at a time as they are written.
    serialised = (json.dumps(record, ensure_ascii=False) + '\n' for record in data_lis)
    with open(file_path, 'at', encoding='utf-8') as out:
        out.writelines(serialised)
 if __name__ == '__main__':
    # Script entry point: read the whole text file named by
    # configs['txt_path'], send it to qwen_api() in overlapping windows, and
    # append each reply to seg1.txt.
    #
    # Each window is 3500 characters long but the start only advances by 2500,
    # so consecutive windows overlap by 1000 characters. The overlap keeps text
    # near a window boundary from being cut off in the middle of a passage.
    step = 2500
    window = 3500
    with open(configs['txt_path'], 'r', encoding='utf-8') as src:
        text = src.read()
    for start in tqdm(range(0, len(text), step)):
        content = text[start:start + window]
        print(content)
        answer = qwen_api(content)
        # Segments are one per line; terminate every reply with a newline so
        # the last segment of this window is not fused with the first segment
        # of the next one in the output file.
        if not answer.endswith('\n'):
            answer += '\n'
        # Re-open in append mode on every iteration so that finished windows
        # are already on disk if a later API call fails or is interrupted.
        with open('seg1.txt', 'a', encoding='utf-8') as out:
            out.write(answer)