Add files via upload

This commit is contained in:
এ許我辞忧࿐♡ 2024-03-09 21:30:52 +08:00 committed by GitHub
parent 7891e1aa5f
commit 1912b745a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

62
rag/src/util/text_seg.py Normal file
View File

@ -0,0 +1,62 @@
# 对文本类的非QA对数据做切分 --- 使用qwen的api对书籍进行语义分割
import json
import random
import argparse
import yaml
import re
import copy
from tqdm import tqdm
# config.yml is not shipped with this script; each user defines their own.
# It is opened relative to the current working directory, and the parsed
# result is kept in the module-level ``configs`` mapping used below.
with open('config.yml', mode='r', encoding='utf-8') as f:
    configs = yaml.load(f, Loader=yaml.FullLoader)
def qwen_api(content):
    """Ask the ``qwen-max`` model to split ``content`` into one-line segments.

    The text is embedded into a fixed Chinese prompt that asks the model to
    divide it by topic/semantics without shortening it, one segment per line.

    Args:
        content: The raw text chunk to be segmented.

    Returns:
        The text of the model's reply when the call reports HTTP 200 (the
        reply is also printed to stdout); otherwise the literal string
        ``'ERROR'``.
    """
    import dashscope
    from http import HTTPStatus

    # The closing quotes sit at column 0 on purpose: the prompt must end with
    # exactly one newline after the inserted text and no trailing spaces.
    prompt_template = '''我们的分割要求是每一个划分占一行请你帮我将下列txt文本按照书本的内容比如事件的背景心理学名词的定义特点阶段划分实验内容等进行划分要求文本内容不能缩减也可以按照语义分割比如某几句话都是讲的一回事就划分一行要求划分之后的文本内容详细主题明确要求每一个划分仅用一行表示。以下为要求分割的txt文本{}
'''
    full_prompt = prompt_template.format(content)

    # The API key is read from the module-level ``configs`` mapping.
    dashscope.api_key = configs['dashscope_api_key']
    reply = dashscope.Generation.call(
        model='qwen-max',
        prompt=full_prompt,
        history=[],
    )

    # Any non-200 status is reported to the caller as the 'ERROR' sentinel.
    if reply.status_code != HTTPStatus.OK:
        return 'ERROR'

    segmented = reply.output.text
    print(segmented)
    return segmented
def save_jsonl(data_lis, file_path):
    """Append every item of ``data_lis`` to ``file_path`` in JSON Lines form.

    Args:
        data_lis: Iterable of JSON-serialisable objects (typically dicts).
        file_path: Destination file; it is opened in append mode, so existing
            content is kept and the file is created when missing.

    Each item becomes exactly one line. Non-ASCII characters are written
    verbatim (``ensure_ascii=False``) using UTF-8.
    """
    import json

    with open(file_path, mode='a', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in data_lis
        )
if __name__ == '__main__':
    # Script entry point: read the book text named by configs['txt_path'],
    # send it to qwen_api() in overlapping windows, and append every reply to
    # 'seg1.txt' in the current working directory.

    # The context manager closes the input file even if reading fails.
    with open(configs['txt_path'], 'r', encoding='utf-8') as src:
        text = src.read()

    # Each window is 3500 characters long but windows start every 2500
    # characters, so neighbouring windows share 1000 characters. The overlap
    # is there so that text near a window boundary is still seen with enough
    # context to be segmented as a complete semantic unit.
    for start in tqdm(range(0, len(text), 2500)):
        content = text[start:start + 3500]
        print(content)
        answer = qwen_api(content)

        # The output format is "one segment per line". Without a trailing
        # newline the last segment of this reply would be glued onto the
        # first segment of the next reply.
        if answer and not answer.endswith('\n'):
            answer += '\n'

        # NOTE: qwen_api() returns the literal 'ERROR' on a failed call, and
        # that marker is written to the output like any other reply.
        # Reopen in append mode for every chunk so finished results are
        # flushed to disk before the next (slow, fallible) API call.
        with open('seg1.txt', 'a', encoding='utf-8') as out:
            out.write(answer)