zoukankan      html  css  js  c++  java
  • Python抓取不得姐动图(报错 urllib.error.HTTPError: HTTP Error 403: Forbidden)

    抓取不得姐动图(报错)

    # -*- coding:utf-8 -*-
    #__author__ :kusy
    #__content__:文件说明
    #__date__:2018/7/23 17:01
    import urllib.request
    import re
    
    def getHtml(url):
        """Fetch ``url`` and return the raw response body.

        The request goes through ``urllib.request.urlopen`` with no custom
        headers, and the response is closed once the body has been read.

        Args:
            url: Address to open; anything ``urlopen`` accepts.

        Returns:
            The undecoded response body as ``bytes``.

        Raises:
            urllib.error.HTTPError: If the server answers with an error
                status such as 403 Forbidden.
        """
        # The context manager releases the connection even if read() raises.
        with urllib.request.urlopen(url) as page:
            return page.read()
    
    def getImg(reg,savePath):
        """Build a downloader that saves every image URL matched by ``reg``.

        Args:
            reg: Regular expression whose capturing group yields the image URL.
            savePath: Prefix prepended to each output name; files are written
                as ``savePath + '<n>.gif'``.

        Returns:
            A function ``giveImg(html)`` that takes a page as UTF-8 encoded
            bytes and downloads each matched URL. The file counter lives in
            the closure, so numbering continues across calls instead of
            restarting at 0.
        """
        import os

        # Compile once here instead of on every page passed to giveImg.
        imgre = re.compile(reg)
        iCnt = 0

        def giveImg(html):
            """Download each image referenced in ``html`` (UTF-8 bytes)."""
            nonlocal iCnt
            imglist = imgre.findall(html.decode('utf-8'))
            if not imglist:
                return
            # urlretrieve does not create missing directories, so make sure
            # the directory part of the prefix exists before writing.
            outDir = os.path.dirname(savePath)
            if outDir:
                os.makedirs(outDir, exist_ok=True)
            for imgurl in imglist:
                urllib.request.urlretrieve(imgurl, savePath + '%s.gif' % iCnt)
                iCnt += 1
        return giveImg
    
    
    # html = getHtml("http://pic.sogou.com/")
    # reg = r'"image":"(.+?)"'  #sougou
    
    # Capture the URL inside each data-original attribute that ends in ".gif".
    # The dot is escaped so it matches a literal "." rather than any character.
    reg = r'data-original="(.+?\.gif)"'
    savePath = 'image/gif/'
    g = getImg(reg,savePath)
    # The first request goes to the bare site root, the rest to /2 ... /9.
    # Starting at 1 avoids fetching the root page twice, which would save the
    # same images under two sets of file names.
    for i in range(1, 10):
        if i > 1:
            print("http://www.budejie.com/" + str(i))
            html = getHtml("http://www.budejie.com/" + str(i))
        else:
            html = getHtml("http://www.budejie.com/")
        g(html)

    报错如下

    E:\kusy\python\venv\Scripts\python.exe E:/kusy/python/getJpg.py
    http://www.budejie.com/2
    Traceback (most recent call last):
      File "E:/kusy/python/getJpg.py", line 35, in <module>
        html = getHtml("http://www.budejie.com/" + str(i))
      File "E:/kusy/python/getJpg.py", line 9, in getHtml
        page = urllib.request.urlopen(url)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen
        return opener.open(url, data, timeout)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open
        response = meth(req, response)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response
        'http', request, response, code, msg, hdrs)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 570, in error
        return self._call_chain(*args)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
        result = func(*args)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 650, in http_error_default
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
    urllib.error.HTTPError: HTTP Error 403: Forbidden
    
    Process finished with exit code 1

     

    百度了一下,问题已解决:在请求中加上 User-Agent 头信息,伪装成浏览器访问即可。修改后的代码如下:

    # -*- coding:utf-8 -*-
    #__author__ :kusy
    #__content__:文件说明
    #__date__:2018/7/23 17:01
    import urllib.request
    import re
    
    def getHtml(url):
        """Fetch ``url`` with a browser-like User-Agent and return the body.

        Args:
            url: Address to open.

        Returns:
            The undecoded response body as ``bytes``.

        Raises:
            urllib.error.HTTPError: If the server answers with an error status.
        """
        # Without this header the request fails with
        # urllib.error.HTTPError: HTTP Error 403: Forbidden.
        # The site rejects crawlers, so the request carries a User-Agent header
        # to look like a browser. The exact value can be looked up with
        # Firefox's FireBug plugin.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=url,headers=headers)
        # The context manager releases the connection even if read() raises.
        with urllib.request.urlopen(req) as page:
            return page.read()
    
    def getImg(reg,savePath):
        """Build a downloader that saves every image URL matched by ``reg``.

        Args:
            reg: Regular expression whose capturing group yields the image URL.
            savePath: Prefix prepended to each output name; files are written
                as ``savePath + '<n>.gif'``.

        Returns:
            A function ``giveImg(html)`` that takes a page as UTF-8 encoded
            bytes and downloads each matched URL. The file counter lives in
            the closure, so numbering continues across calls instead of
            restarting at 0.
        """
        import os

        # Compile once here instead of on every page passed to giveImg.
        imgre = re.compile(reg)
        iCnt = 0

        def giveImg(html):
            """Download each image referenced in ``html`` (UTF-8 bytes)."""
            nonlocal iCnt
            imglist = imgre.findall(html.decode('utf-8'))
            if not imglist:
                return
            # urlretrieve does not create missing directories, so make sure
            # the directory part of the prefix exists before writing.
            outDir = os.path.dirname(savePath)
            if outDir:
                os.makedirs(outDir, exist_ok=True)
            for imgurl in imglist:
                urllib.request.urlretrieve(imgurl, savePath + '%s.gif' % iCnt)
                iCnt += 1
        return giveImg
    
    
    # html = getHtml("http://pic.sogou.com/")
    # reg = r'"image":"(.+?)"'  #sougou
    
    # Capture the URL inside each data-original attribute that ends in ".gif".
    # The dot is escaped so it matches a literal "." rather than any character.
    reg = r'data-original="(.+?\.gif)"'
    savePath = 'image/gif/'
    g = getImg(reg,savePath)
    # The first request goes to the bare site root, the rest to /2 ... /9.
    # Starting at 1 avoids fetching the root page twice, which would save the
    # same images under two sets of file names.
    for i in range(1, 10):
        if i > 1:
            print("http://www.budejie.com/" + str(i))
            html = getHtml("http://www.budejie.com/" + str(i))
        else:
            html = getHtml("http://www.budejie.com/")
        g(html)

    修改后重新运行,图片下载成功。

  • 相关阅读:
    虚拟机安装Linux方案和操作系统启动流程
    CentOS7防止root密码被破解
    子网划分和VLAN
    Python之包的相关
    禁止复制文本的代码 HTML
    asp.net中Session过期设置方法
    CSS+DIV问题!DIV的最小高度问题!
    设置COOKIE过期时间的方法
    网站常见问题及解决方法(div/css)
    ASP.NET中如何删除最近打开的项目和文件的记录
  • 原文地址:https://www.cnblogs.com/kusy/p/9357360.html
Copyright © 2011-2022 走看看