  • Crawler hits an HTTP Error 403

    # coding=gbk
    
    
    from bs4 import BeautifulSoup
    import requests
    import urllib.request
    x = 1
    y = 1
    
    def crawl(url):
        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')
        # Save the parsed page HTML for inspection
        global y
        with open(f'F:/pachong/xnt/{y}.txt', 'w', encoding="utf-8") as f:
            f.write(str(soup))
            y += 1
        # Grab every <img> tag and try to download its src
        yinhuns = soup.select('img')
        print(yinhuns)
        for yh in yinhuns:
            print(yh)
            link = yh.get('src')
            print(link)
            global x
            urllib.request.urlretrieve(link, f'F:/pachong/xnt/{x}.jpg')
            print(f'Downloading image {x}')
            x += 1
            
    for i in range(1,5):
        url = "https://acg.fi/hentai/23643.htm/" + str(i)
        
        try:
            crawl(url)
        except ValueError as f:
            continue
        except Exception as e:
            print(e)
    
    • Running the program returns the following output:
    <img alt="A区(ACG.Fi)" class="logo" src="https://acg.fi/logo.png"/>
    https://acg.fi/logo.png
    HTTP Error 403: Forbidden
    
    • There are three problems

      • Searching for the src values does not find all of the target image URLs
      • The first URL returned hits a 403 error and access is denied (a possible workaround is sketched after this list)
      • soup.select does not return the expected list
    • Thoughts

      • The target URLs may contain Chinese characters that cannot be encoded properly
      • The response text of the requested URL could instead be filtered with a regular expression
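    • The 403 most likely comes from the server rejecting urllib's default "Python-urllib" User-Agent; the revised script below passes a browser-like header to requests.get. A minimal sketch of the same fix for the urlretrieve download step (the helper name and header string are illustrative assumptions, not the original code):

    import urllib.request

    def download_with_header(link, save_path):
        # urlretrieve cannot set request headers, so build a Request that
        # carries a browser-like User-Agent and write the bytes ourselves
        req = urllib.request.Request(
            link,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"}  # assumed UA string
        )
        with urllib.request.urlopen(req) as resp, open(save_path, 'wb') as f:
            f.write(resp.read())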
    #coding=gbk
    from bs4 import BeautifulSoup
    import requests
    import urllib.request
    x = 1
    
    
    def crawl(url, header):
        # Send a browser-like User-Agent to avoid the 403
        res = requests.get(url, headers=header)
        soup = BeautifulSoup(res.text, 'html.parser')
        # Limit the search to the article body and the first 4 <img> tags
        yinhuns = soup.find('div', attrs={'id': "content-innerText"}).find_all('img', limit=4)
        print(yinhuns)
        
        for yh in yinhuns:
            
            link = yh.get('src')
            global x
            print(x)
            urllib.request.urlretrieve(link, 'F:/pachong/xnt/{}.jpg'.format(x))
            print('Downloading image {0}'.format(x))
            x += 1
    
    
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    for i in range(1,5):
        url = "https://acg.fi/hentai/23643.htm/" + str(i)
        
        try:
            crawl(url, header)
        except ValueError as f:
            continue
        except Exception as e:
            print(e)
    
    • This version uses the find() and find_all() methods, but still does not solve the list problem
    • A later attempt re-encoded the Chinese part of the URLs with urllib.parse.quote, but urllib.request.urlretrieve still raised an error (see the sketch below)
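    • One likely culprit: with its default safe='/', urllib.parse.quote also escapes the ':' after the scheme when the whole URL is quoted, which breaks the address. A minimal sketch of the difference, using a made-up example URL:

    import urllib.parse

    url = 'https://acg.fi/apic/中文图片.jpg'      # hypothetical URL with a Chinese segment
    # Quoting the whole URL also escapes ':' and yields an unusable address
    print(urllib.parse.quote(url))                # https%3A//acg.fi/apic/%E4%B8%AD%E6%96%87%E5%9B%BE%E7%89%87.jpg
    # Quoting only the Chinese part (what the final script does) keeps the URL valid
    print(urllib.parse.quote('中文图片'))          # %E4%B8%AD%E6%96%87%E5%9B%BE%E7%89%87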
    • After revising again:
    #coding=gbk
    
    import requests
    import urllib.parse
    import re
    from PIL import Image
    from io import BytesIO
    x = 1 
    
    # Fetch the page and extract the image source URLs with a regex
    def crawl(url, header):

        res = requests.get(url, headers=header)
        # Close the response right after opening it, to avoid anti-crawler connection resets
        res.close()
        res = res.text
        pattern = re.compile('http.*?apic.*?jpg')
        result = re.findall(pattern, res)
        return result
    
    # Download the images from the re-encoded URLs
    def down(outs, folder_path):
        global x
        for out in outs:
            # Fetch the re-encoded URL
            res = requests.get(out)
            # Close the response right after opening it, to avoid anti-crawler connection resets
            res.close()
            # Load the bytes into PIL via BytesIO, then save as a JPEG
            bf = BytesIO()
            bf.write(res.content)
            img = Image.open(bf)
            print(f'Downloading image {x}')
            img.save(folder_path + f"{x}.jpg")
            x += 1
    
    # Re-encode the Chinese segment of each image URL
    def bianma(results):
        outs = []
        for s in results:
            # Pick out the Chinese part with a regex
            pattern = re.compile('[\u4e00-\u9fa5]+')
            result = re.search(pattern, s)
            if result is None:
                # No Chinese in this URL, keep it unchanged
                outs.append(s)
                continue
            su = result.group(0)
            # Percent-encode the Chinese part
            li = urllib.parse.quote(su)
            # Replace the Chinese part of the original URL with the encoded text
            out = re.sub(pattern, li, s)
            outs.append(out)
        # De-duplicate while preserving the original order
        outs_cp = sorted(set(outs), key=outs.index)
        return outs_cp
    
    def main():
        try:
            header = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
            }
            folder_path = 'F:/pachong/xnt/'
            for i in range(1,5):
                url = "https://acg.fi/hentai/23643.htm/" + str(i)
                results = crawl(url, header)
                outs = bianma(results)
                down(outs, folder_path)
        except Exception as e:
            print(e)

    if __name__ == '__main__':
        main()
    
    • For image URLs whose path contains Chinese, downloading via BytesIO and PIL works; this proved to be an effective fix
    • Several runs hit "[Errno 10054] the remote host forcibly closed an existing connection"; calling close() right after requests.get() avoids it
    • The program now runs without errors, it is just a bit slow; multithreading could be tried later (a sketch follows)
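    • A minimal sketch of that multithreading idea, reusing the BytesIO + PIL saving from down() above (the worker count and helper names are assumptions, not the original code):

    from concurrent.futures import ThreadPoolExecutor
    from io import BytesIO
    from PIL import Image
    import requests

    folder_path = 'F:/pachong/xnt/'

    def fetch_one(task):
        # task is a (sequence number, re-encoded image URL) pair
        i, out = task
        res = requests.get(out)
        res.close()
        # Same BytesIO + PIL saving as in down(), but done per worker thread
        img = Image.open(BytesIO(res.content))
        img.save(folder_path + f"{i}.jpg")
        return i

    def down_parallel(outs, workers=4):
        # Download several images concurrently; 4 workers is an arbitrary choice
        with ThreadPoolExecutor(max_workers=workers) as pool:
            for i in pool.map(fetch_one, enumerate(outs, start=1)):
                print(f'Downloaded image {i}')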
  • Original article: https://www.cnblogs.com/rener0424/p/10970096.html