OliveSensorAPI/scripts/qa_generation/util/data_loader.py

149 lines
4.4 KiB
Python
Raw Normal View History

2024-03-07 17:56:07 +08:00
import os
import re
import json
import glob
2024-03-07 17:56:07 +08:00
from typing import List, Dict
2024-03-16 13:12:15 +08:00
from config.config import data_dir, judge_dir
2024-03-07 17:56:07 +08:00
from util.logger import get_logger
logger = get_logger()
2024-03-16 13:12:15 +08:00
"""
递归获取 数据整合 下的所有 .jsonl 文件列表
"""
def get_jsonl_file_paths() -> List[str]:
    """Collect the paths of all ``.jsonl`` files found under ``judge_dir``.

    The directory tree rooted at ``judge_dir`` is traversed recursively with
    ``os.walk``.

    Returns:
        The path (containing directory joined with the file name) of every
        file whose name ends in ``.jsonl``, in the order ``os.walk`` yields
        them.
    """
    jsonl_suffix = re.compile(r'\.jsonl$')
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(judge_dir)
        for name in names
        if jsonl_suffix.search(name)
    ]
def get_QA_pairs(json_path):
    """Read a text file and return its lines as a list of raw strings.

    The file is read as UTF-8, leading/trailing whitespace of the whole
    content is stripped, and the remainder is split on ``'\\n'``. Lines are
    returned unparsed; decoding each one is left to the caller.

    Args:
        json_path: Path of the file to read.

    Returns:
        A list with one string per line. An empty or whitespace-only file
        yields ``[]`` rather than ``['']``, so callers never receive a
        phantom empty record.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        content = f.read().strip()
    # ''.split('\n') evaluates to [''], which is not a real line.
    if not content:
        return []
    # Split on '\n' only: str.splitlines() would also break on characters
    # such as U+2028 that may legitimately appear inside a JSON string.
    return content.split('\n')
2024-03-07 17:56:07 +08:00
"""
递归获取 data_dir 下的所有 .txt 文件列表
"""
def get_file_list() -> List[str]:
    """Collect the paths of all ``.txt`` files found under ``data_dir``.

    The directory tree rooted at ``data_dir`` is traversed recursively with
    ``os.walk``. A warning is logged when no ``.txt`` file is found.

    Returns:
        The path of every file whose name ends in ``.txt``; an empty list
        when there are none.
    """
    found = [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(data_dir)
        for name in names
        if name.endswith('.txt')
    ]
    # An empty result is reported but is not treated as an error.
    if not found:
        logger.warning(f'No txt text found in {data_dir}, please check!')
    return found
"""
获取 txt 文本的所有内容按句子返回 List
file_path: txt 文本路径
window_size: 滑窗大小单位为句子数
overlap_size: 重叠大小单位为句子数
"""
def get_txt_content(
    file_path: str,
    window_size: int = 6,
    overlap_size: int = 2
) -> List[str]:
    """Split a text file into sentences and return sliding-window chunks.

    The file is read as UTF-8 and split wherever whitespace follows one of
    the sentence-ending characters in the pattern below. Spaces and tabs
    inside each sentence are then removed. Consecutive sentences are grouped
    into windows, each joined with ``'\\n'``.

    NOTE(review): the window start advances by ``overlap_size`` sentences,
    so two neighbouring windows actually share ``window_size - overlap_size``
    sentences. That stride is kept as-is for backward compatibility; confirm
    whether it matches the intended meaning of "overlap".

    Args:
        file_path: Path of the text file to read.
        window_size: Number of sentences per window.
        overlap_size: Step, in sentences, between consecutive window starts.

    Returns:
        A list of window strings. If ``window_size`` is at least the number
        of sentences, a single-element list holding the whole text. ``None``
        (after logging an error) when ``window_size < overlap_size`` or when
        ``overlap_size`` is not positive.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read().strip()

    # Simple approach: split after sentence-ending punctuation, then drop
    # spaces and tabs inside each sentence.
    sentences = re.split(r'(?<=[。!?])\s+', content)
    sentences = [s.replace(' ', '').replace('\t', '') for s in sentences]
    sentences_amount = len(sentences)

    # Parameter checks.
    if window_size < overlap_size:
        logger.error("window_size must be greater than or equal to overlap_size")
        return None
    if window_size >= sentences_amount:
        logger.warning("window_size exceeds the amount of sentences, and the complete text content will be returned")
        return ['\n'.join(sentences)]
    if overlap_size <= 0:
        # A zero step makes range() raise an opaque ValueError and a negative
        # step silently produces no windows at all; report it explicitly.
        logger.error("overlap_size must be a positive integer")
        return None

    # Start index of the last window that still holds window_size sentences.
    last_start = sentences_amount - window_size
    starts = list(range(0, last_start + 1, overlap_size))
    # When the stride does not land exactly on last_start, the trailing
    # sentences would be lost; add one final window flush with the end.
    if starts[-1] != last_start:
        starts.append(last_start)

    return ['\n'.join(sentences[i: i + window_size]) for i in starts]
"""
提取返回的 QA
"""
def capture_qa(content: str) -> List[Dict]:
    """Extract and parse the first fenced ``json`` block found in ``content``.

    Only the first block delimited by a ```` ```json ```` opening fence and a
    ```` ``` ```` closing fence is considered; any later blocks are ignored.

    Args:
        content: Text that may contain a fenced JSON block.

    Returns:
        The value produced by ``json.loads`` for the block, or ``None``
        (after logging a warning) when no block is found or the block cannot
        be parsed.
    """
    match = re.search(r'```json(.*?)```', content, re.DOTALL)
    if not match:
        logger.warning("No JSON block found.")
        return None

    try:
        return json.loads(match.group(1))
    except (json.JSONDecodeError, RecursionError):
        # Only parse failures are treated as "no result". The previous bare
        # `except:` plus `return` inside `finally` also swallowed
        # KeyboardInterrupt and SystemExit.
        logger.warning('Unable to parse JSON properly.')
        return None
2024-03-16 13:12:15 +08:00
"""
storage_list 存入到 storage_jsonl_path
"""
def save_to_file(storage_jsonl_path, storage_list):
    """Append the items of ``storage_list`` to a file, one JSON document per line.

    The file is opened in append mode with UTF-8 encoding, so existing
    content is preserved. Non-ASCII characters are written unescaped.

    Args:
        storage_jsonl_path: Path of the file to append to.
        storage_list: Iterable of JSON-serialisable items.
    """
    with open(storage_jsonl_path, mode='a', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in storage_list
        )
Update code (#8) * feat: add agents/actions/write_markdown * [ADD] add evaluation result of base model on 5/10 epochs * Rename mother.json to mother_v1_2439.json * Add files via upload * [DOC] update README * Update requirements.txt update mpi4py installation * Update README_EN.md update English comma * Update README.md 基于母亲角色的多轮对话模型微调完毕。已上传到 Huggingface。 * 多轮对话母亲角色的微调的脚本 * Update README.md 加上了王几行XING 和 思在 的作者信息 * Update README_EN.md * Update README.md * Update README_EN.md * Update README_EN.md * Changes to be committed: modified: .gitignore modified: README.md modified: README_EN.md new file: assets/EmoLLM_transparent.png deleted: assets/Shusheng.jpg new file: assets/Shusheng.png new file: assets/aiwei_demo1.gif new file: assets/aiwei_demo2.gif new file: assets/aiwei_demo3.gif new file: assets/aiwei_demo4.gif * Update README.md rectify aiwei_demo.gif * Update README.md rectify aiwei_demo style * Changes to be committed: modified: README.md modified: README_EN.md * Changes to be committed: modified: README.md modified: README_EN.md * [Doc] update readme * [Doc] update readme * Update README.md * Update README_EN.md * Update README.md * Update README_EN.md * Delete datasets/mother_v1_2439.json * Rename mother_v2_3838.json to mother_v2.json * Delete datasets/mother_v2.json * Add files via upload * Update README.md * Update README_EN.md * [Doc] Update README_EN.md minor fix * InternLM2-Base-7B QLoRA微调模型 链接和测评结果更新 * add download_model.py script, automatic download of model libraries * 清除图片的黑边、更新作者信息 modified: README.md new file: assets/aiwei_demo.gif deleted: assets/aiwei_demo1.gif modified: assets/aiwei_demo2.gif modified: assets/aiwei_demo3.gif modified: assets/aiwei_demo4.gif * rectify aiwei_demo transparent * transparent * modify: aiwei_demo table--->div * modified: aiwei_demo * modify: div ---> table * modified: README.md * modified: README_EN.md * update model config file links * Create internlm2_20b_chat_lora_alpaca_e3.py 20b模型的配置文件 * update model config file 
links update model config file links * Revert "update model config file links" --------- Co-authored-by: jujimeizuo <fengzetao.zed@foxmail.com> Co-authored-by: xzw <62385492+aJupyter@users.noreply.github.com> Co-authored-by: Zeyu Ba <72795264+ZeyuBa@users.noreply.github.com> Co-authored-by: Bryce Wang <90940753+brycewang2018@users.noreply.github.com> Co-authored-by: zealot52099 <songyan5209@163.com> Co-authored-by: HongCheng <kwchenghong@gmail.com> Co-authored-by: Yicong <yicooong@qq.com> Co-authored-by: Yicooong <54353406+Yicooong@users.noreply.github.com> Co-authored-by: aJupyter <ajupyter@163.com> Co-authored-by: MING_X <119648793+MING-ZCH@users.noreply.github.com> Co-authored-by: Ikko Eltociear Ashimine <eltociear@gmail.com> Co-authored-by: HatBoy <null2none@163.com> Co-authored-by: ZhouXinAo <142309012+zxazys@users.noreply.github.com>
2024-04-14 10:09:17 +08:00
import time
import os
def safe_remove(file_path, max_attempts=5, delay=1):
    """Delete ``file_path``, retrying while the removal raises ``PermissionError``.

    Each failed attempt is reported on stdout and followed by a pause of
    ``delay`` seconds. Exceptions other than ``PermissionError`` are not
    handled here and propagate to the caller.

    Args:
        file_path: Path of the file to delete.
        max_attempts: Maximum number of removal attempts.
        delay: Seconds to sleep after each failed attempt.
    """
    for attempt in range(max_attempts):
        try:
            os.remove(file_path)
        except PermissionError as e:
            print(f"Attempt {attempt+1}: Unable to delete {file_path} - {str(e)}")
            time.sleep(delay)
        else:
            print(f"File {file_path} successfully deleted.")
            return
    # Reached only when every attempt failed (or none was allowed).
    print(f"Failed to delete {file_path} after {max_attempts} attempts.")
2024-03-16 13:12:15 +08:00
"""
将并发产生的文件合并成为一个文件
"""
def merge_sub_qa_generation(directory, storage_jsonl_path):
    """Merge the partial JSONL files of a run into ``storage_jsonl_path``.

    Every file matching ``os.path.join(directory, storage_jsonl_path + "*")``
    is read line by line, each non-blank line is parsed as JSON, and the
    collected records are appended to ``storage_jsonl_path`` through
    ``save_to_file``.

    NOTE(review): if ``storage_jsonl_path`` already exists and matches the
    glob pattern, its records are read and appended to it again; verify
    against the caller that this cannot happen.

    Args:
        directory: Directory in which the partial files are searched.
        storage_jsonl_path: Prefix of the partial files and path of the
            merged output file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Find every file whose path starts with the given prefix.
    pattern = os.path.join(directory, storage_jsonl_path + "*")
    merged = []
    # glob returns matches in arbitrary order; sort for a reproducible merge.
    for file_path in sorted(glob.glob(pattern)):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank line (e.g. a trailing newline) is not a record and
                # would make json.loads raise.
                if not line.strip():
                    continue
                merged.append(json.loads(line))
        # NOTE: the partial files are left in place; removing them with
        # safe_remove(file_path) is currently disabled.
    save_to_file(storage_jsonl_path, merged)