olivebot/llm/agent/tools/WebPageScraper.py


import abc
from typing import Any

import requests
from bs4 import BeautifulSoup
from langchain.tools import BaseTool


class WebPageScraper(BaseTool, abc.ABC):
    name: str = "WebPageScraper"
    description: str = (
        "This tool fetches the content of a web page. Pass the URL of the "
        "page to query as the parameter, e.g. https://www.baidu.com/."
    )

    def __init__(self):
        super().__init__()

    async def _arun(self, *args: Any, **kwargs: Any) -> Any:
        # The async variant is not used in this use case, so it is left unimplemented.
        pass

    def _run(self, para: str) -> str:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/58.0.3029.110 Safari/537.3'
        }
        try:
            response = requests.get(para, headers=headers, timeout=10, verify=True)
            soup = BeautifulSoup(response.text, 'html.parser')
            # Return a string rather than the BeautifulSoup object so the result
            # matches the declared str return type.
            return str(soup)
        # requests wraps certificate failures in SSLError
        # (requests.exceptions.SSLCertVerificationError does not exist).
        except requests.exceptions.SSLError:
            return 'SSL certificate verification failed'
        except requests.exceptions.Timeout:
            return 'Request timed out'
        except Exception as e:
            print("HTTP error:", e)
            return 'Unable to fetch the content of this page'


if __name__ == "__main__":
    tool = WebPageScraper()
    result = tool.run("https://book.douban.com/review/14636204")
    print(result)
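

# --- Optional: wiring the tool into a LangChain agent ---
# A minimal sketch, not part of the original file. It assumes the legacy
# `initialize_agent` API that matches the `langchain.tools.BaseTool` import
# used above, and a caller-supplied chat model (`llm` is a placeholder the
# surrounding project would have to provide).
#
# from langchain.agents import initialize_agent, AgentType
#
# def build_agent(llm):
#     """Return a ReAct-style agent that can call WebPageScraper."""
#     return initialize_agent(
#         tools=[WebPageScraper()],
#         llm=llm,
#         agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
#         verbose=True,
#     )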