1、使用库:requests、BeautifulSoup
2、requests
response = requests.get(
url='https://www.autohome.com.cn/news/'
)
response.encoding = response.apparent_encoding
response.text
response.content
response.status_code
3、BeautifulSoup
转换成soup对象
soup = BeautifulSoup(response.text,features='html.parser') #默认用html.parser,生产用lxml,性能更好
根据id查找
soup.find(id="chazy")
查找li、div、img等html标签下的文本
target = soup.find(id="auto-channel-lazyload-article").find('li') # 找到第一个li
li_list = soup.find(id="auto-channel-lazyload-article").find_all('li') # 找到所有li
4、简单示例
import requests
from bs4 import BeautifulSoup

# Fetch the AutoHome news list page and download each article's thumbnail,
# saving it as "<article title>.jpg" in the current directory.
response = requests.get(
    url='https://www.autohome.com.cn/news/'
)
# Use the detected encoding so the Chinese page text decodes correctly.
response.encoding = response.apparent_encoding
print(response.status_code)

# html.parser is the built-in default; lxml is faster and preferred in production.
soup = BeautifulSoup(response.text, features='html.parser')

# Locate articles by container id (this is an id lookup, not a regex search).
li_list = soup.find(id="auto-channel-lazyload-article").find_all('li')
for li in li_list:
    a = li.find('a')  # anchor wrapping the article
    if a is None:
        # Skip list items without a link (e.g. ad/placeholder entries).
        continue
    print(a.attrs)
    print(a.attrs.get('href'))

    h3 = li.find('h3')
    if h3 is None:
        continue
    title = h3.text  # article title, used as the image file name

    img_tag = li.find('img')
    if img_tag is None:
        continue
    src = img_tag.get('src')
    # NOTE(review): src is often protocol-relative ("//...") on this site;
    # prepend the scheme so requests can fetch it — confirm against live page.
    if src.startswith('//'):
        src = 'https:' + src
    res = requests.get(src)
    file_name = "%s.jpg" % (title,)
    with open(file_name, 'wb') as f:
        f.write(res.content)