1. 先在浏览器中检查页面的 charset,例如 <meta charset="utf-8">;
'utf-8' 比较常见,也有 'gbk'。
然后对 get 返回的响应对象的编码进行相应的调整,例如 response.encoding = 'gbk'。
2.尝试使用content属性代替text
3. 对于局部乱码(多为中文),例如 img 标签中的 'alt' 属性,可尝试先使用 'iso-8859-1' 进行编码,再使用 'gbk' 进行解码,例如 text.encode('iso-8859-1').decode('gbk')。
import os
from urllib import request

import requests
from bs4 import BeautifulSoup
# Scraper script: walks list pages 2..5 of the 4kdongman section, follows each
# thumbnail link to its detail page, and saves the image found there to disk,
# named after the image's alt text.
url_model = 'http://pic.netbian.com/4kdongman/index_{index}.html'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
pre = 'http://pic.netbian.com'
save_dir = './图片/'

# urlretrieve does not create missing directories, so make sure the target
# folder exists before the first download.
os.makedirs(save_dir, exist_ok=True)

for i in range(2, 6):
    url = url_model.format(index=i)
    response = requests.get(url=url, headers=headers)
    # Approach 1: override the encoding before reading .text so the list page
    # is decoded as GBK (presumably the charset the page declares).
    response.encoding = 'gbk'
    ht = response.text
    # NOTE: a stray exit() used to sit here and terminated the script before
    # any page was parsed; it has been removed so the download loop runs.
    soup = BeautifulSoup(ht, 'lxml')
    tagImg = soup.select('.slist > ul > li > a')
    for j in tagImg:
        imgurl = pre + j['href']
        response1 = requests.get(url=imgurl, headers=headers)
        # Approach 2: hand BeautifulSoup the raw bytes (.content) instead of
        # .text, which came out garbled here; setting the encoding to gbk
        # first (as done for the list page) also works.
        ht1 = response1.content
        soup1 = BeautifulSoup(ht1, 'lxml')
        matches = soup1.select('#img > img')
        if not matches:
            # Detail page without the expected <img>; skip it rather than
            # crash the whole run with an IndexError.
            continue
        ImgDate = matches[0]
        final_url = pre + ImgDate['src']
        # Approach 3 applies here if the alt text is still garbled:
        # re-encode it as iso-8859-1 and decode it as gbk.
        imgname = ImgDate['alt'] + '.jpg'
        imgpath = save_dir + imgname
        request.urlretrieve(final_url, imgpath)