from bs4 import BeautifulSoup
import abc
from typing import Any
from langchain.tools import BaseTool
import requests


class WebPageScraper(BaseTool, abc.ABC):
    name: str = "WebPageScraper"
    description: str = (
        "This tool fetches the content of a web page. Pass the URL of the page "
        "to query as the argument, e.g. https://www.baidu.com/."
    )

    def __init__(self):
        super().__init__()

    async def _arun(self, *args: Any, **kwargs: Any) -> Any:
        # The async variant is not used in this example, so it is left unimplemented.
        pass

    def _run(self, para: str) -> str:
        # Send a browser-like User-Agent so simple bot filters do not reject the request.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        try:
            response = requests.get(para, headers=headers, timeout=10, verify=True)
            soup = BeautifulSoup(response.text, 'html.parser')
            # Return the extracted text so the result matches the declared str return type.
            return soup.get_text()
        except requests.exceptions.SSLError:
            return 'SSL certificate verification failed'
        except requests.exceptions.Timeout:
            return 'Request timed out'
        except Exception as e:
            print("Http Error:", e)
            return 'Unable to fetch the content of this page'


if __name__ == "__main__":
    tool = WebPageScraper()
    result = tool.run("https://book.douban.com/review/14636204")
    print(result)
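
# A minimal sketch of wiring this tool into a LangChain agent (commented out so the
# file still runs standalone). It assumes the legacy initialize_agent API and an
# OPENAI_API_KEY in the environment; the model choice and agent type below are
# illustrative assumptions, not part of the original example.
#
#     from langchain.agents import AgentType, initialize_agent
#     from langchain.chat_models import ChatOpenAI
#
#     llm = ChatOpenAI(temperature=0)
#     agent = initialize_agent(
#         tools=[WebPageScraper()],
#         llm=llm,
#         agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
#         verbose=True,
#     )
#     print(agent.run("Summarize https://book.douban.com/review/14636204"))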