Add files via upload
This commit is contained in:
parent
7891e1aa5f
commit
1912b745a6
62
rag/src/util/text_seg.py
Normal file
62
rag/src/util/text_seg.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
# 对文本类的(非QA对)数据做切分 --- 使用qwen的api对书籍进行语义分割
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
import argparse
|
||||||
|
import yaml
|
||||||
|
import re
|
||||||
|
import copy
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# config.yml文件由自己定义
|
||||||
|
with open('config.yml', 'r', encoding='utf-8') as f:
|
||||||
|
configs = yaml.load(f.read(), Loader=yaml.FullLoader)
|
||||||
|
|
||||||
|
def qwen_api(content):
|
||||||
|
import dashscope
|
||||||
|
from http import HTTPStatus
|
||||||
|
|
||||||
|
Input = '''我们的分割要求是每一个划分占一行,请你帮我将下列txt文本按照书本的内容(比如:事件的背景,心理学名词的定义,特点,阶段划分,实验内容等)进行划分,要求文本内容不能缩减,也可以按照语义分割,比如某几句话都是讲的一回事就划分一行,要求划分之后的文本内容详细,主题明确,要求每一个划分仅用一行表示。以下为要求分割的txt文本:{}
|
||||||
|
'''.format(content)
|
||||||
|
|
||||||
|
dashscope.api_key = configs['dashscope_api_key']
|
||||||
|
response = dashscope.Generation.call(
|
||||||
|
model='qwen-max',
|
||||||
|
prompt=Input,
|
||||||
|
history=[],
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code == HTTPStatus.OK:
|
||||||
|
result = response.output.text
|
||||||
|
print(result)
|
||||||
|
else:
|
||||||
|
result = 'ERROR'
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def save_jsonl(data_lis, file_path):
|
||||||
|
import json
|
||||||
|
|
||||||
|
# 将字典列表写入文件,每一行一个字典
|
||||||
|
with open(file_path, 'at', encoding='utf-8') as file:
|
||||||
|
for item in data_lis:
|
||||||
|
json_string = json.dumps(item, ensure_ascii=False) + '\n'
|
||||||
|
file.write(json_string)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
file_name = 'a0.jsonl'
|
||||||
|
conversations = []
|
||||||
|
path = configs['txt_path']
|
||||||
|
f = open(path, 'r', encoding='utf-8')
|
||||||
|
str = f.read()
|
||||||
|
f.close()
|
||||||
|
for i in tqdm(range(0, len(str), 2500)):
|
||||||
|
# 保证所有文本都能按照完整的语义进行分割
|
||||||
|
content = str[i:i+3500]
|
||||||
|
print(content)
|
||||||
|
answer = qwen_api(content)
|
||||||
|
|
||||||
|
f2 = open('seg1.txt', 'a', encoding='utf-8')
|
||||||
|
f2.write(answer)
|
||||||
|
f2.close()
|
Loading…
Reference in New Issue
Block a user