A small project from GitHub: using requests and bs4 to collect all the links on a given web page:
"""
一个小程序,获取指定网页上的所有链接
"""
import requests
from bs4 import BeautifulSoup
url = input("请输入网址:") # 从终端输入网址
if ("https" or "http") in url: # 判定一下
webData = requests.get(url) # 获取网页响应
# print(webData)
else:
webData = requests.get("https://" + url)
webData.encoding = webData.apparent_encoding # 编码
webData.raise_for_status()
# webData.encoding = 'utf-8' # 编码
# print(webData.text)
htmlData = webData.text
# 解析网页数据
# soup = BeautifulSoup(htmlData, 'html.parser')
soup = BeautifulSoup(htmlData, 'lxml')
# print(soup)
# 开始查找网页下所有链接
allLinksFromPage = []
links = soup.find_all('a')
# print(links)
for link in links:
getLink = link.get('href')
allLinksFromPage.append(getLink)
# print(allLinksFromPage)
# 开始存储
with open('myLinks.txt', 'w') as saved:
print(allLinksFromPage[0:10], file=saved) # 保存前十条
saved.close()
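Note that many of the collected hrefs will be relative paths (e.g. /about), and <a> tags without an href attribute yield None. A minimal follow-up sketch, using urllib.parse.urljoin from the standard library to resolve everything to absolute URLs; the base URL here is a hypothetical example:

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

base = "https://example.com"  # hypothetical base URL for illustration
soup = BeautifulSoup(requests.get(base).text, 'lxml')

absoluteLinks = []
for link in soup.find_all('a'):
    href = link.get('href')
    if href:  # skip <a> tags that have no href attribute
        # resolve relative paths against the base URL; absolute URLs pass through unchanged
        absoluteLinks.append(urljoin(base, href))
print(absoluteLinks[:10])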