zoukankan      html  css  js  c++  java
  • python抓取不得姐动图(报错 urllib.error.HTTPError: HTTP Error 403: Forbidden)

    抓取不得姐动图(报错)

    # -*- coding:utf-8 -*-
    #__author__ :kusy
    #__content__:文件说明
    #__date__:2018/7/23 17:01
    import urllib.request
    import re
    
    def getHtml(url):  # Fetch `url` with urllib's default request headers and return the raw response body.
        page = urllib.request.urlopen(url)  # No custom User-Agent is sent; this is the call the recorded traceback ends in with HTTP 403.
        html = page.read()  # Whole body, still undecoded (the caller decodes it as UTF-8).
        # print(html)
        return html
    
    def getImg(reg,savePath):  # Build a downloader: `reg` is a regex whose re.findall results are image URLs, `savePath` is the string prefix for saved files.
        iCnt = 0  # Running file index, kept in the closure so it survives between calls.
        def giveImg(html):  # `html` is the page body; every URL that `reg` finds in it is downloaded.
            imgre = re.compile(reg)
            imglist = re.findall(imgre, html.decode('utf-8'))  # The page is decoded as UTF-8 before matching.
            nonlocal iCnt  # Rebind the enclosing counter so numbering continues from page to page.
            for imgurl in imglist:
                urllib.request.urlretrieve(imgurl, savePath + '%s.gif' % iCnt)  # Target name is <savePath><index>.gif.
                iCnt += 1
        return giveImg
    
    
    # html = getHtml("http://pic.sogou.com/")
    # reg = r'"image":"(.+?)"'  #sougou
    
    reg = r'data-original="(.+?.gif)"'  # Captures each data-original value; NOTE: the dot before "gif" is unescaped, so it matches any character.
    savePath = 'image/gif/'  # Files are written as image/gif/<n>.gif.
    g = getImg(reg,savePath)
    for i in range(10):  # i = 0 and i = 1 both request the site root; i = 2..9 request /2 .. /9.
        if i >1:
            print("http://www.budejie.com/" + str(i))
            html = getHtml("http://www.budejie.com/" + str(i))  # The call quoted as the first frame of the recorded traceback.
        else:
            html = getHtml("http://www.budejie.com/")
        g(html)  # Download every GIF found on this page.

    报错如下

    E:\kusy\python\venv\Scripts\python.exe E:/kusy/python/getJpg.py
    http://www.budejie.com/2
    Traceback (most recent call last):
      File "E:/kusy/python/getJpg.py", line 35, in <module>
        html = getHtml("http://www.budejie.com/" + str(i))
      File "E:/kusy/python/getJpg.py", line 9, in getHtml
        page = urllib.request.urlopen(url)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen
        return opener.open(url, data, timeout)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open
        response = meth(req, response)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response
        'http', request, response, code, msg, hdrs)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 570, in error
        return self._call_chain(*args)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
        result = func(*args)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 650, in http_error_default
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
    urllib.error.HTTPError: HTTP Error 403: Forbidden
    
    Process finished with exit code 1

     

    百度了下已解决:

    # -*- coding:utf-8 -*-
    #__author__ :kusy
    #__content__:文件说明
    #__date__:2018/7/23 17:01
    import urllib.request
    import re
    
    def getHtml(url):
        """Fetch ``url`` while presenting a browser User-Agent and return the raw body.

        The request carries a Firefox User-Agent header because, without it, the
        site refuses urllib's default client with
        ``urllib.error.HTTPError: HTTP Error 403: Forbidden``. Header values of a
        real browser can be looked up with its developer tools (for example the
        Firebug add-on for Firefox).

        Args:
            url: Address of the page to download.

        Returns:
            The undecoded response body as ``bytes``.

        Raises:
            urllib.error.HTTPError: The server answered with an error status.
            urllib.error.URLError: The server could not be reached.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=url,headers=headers)
        # Close the response deterministically instead of leaving the
        # connection open until the object happens to be garbage-collected.
        with urllib.request.urlopen(req) as page:
            return page.read()
    
    def getImg(reg,savePath):
        """Create a downloader that saves every image URL found in a page.

        Args:
            reg: Regular expression whose ``re.findall`` results are the image
                URLs, i.e. a pattern with one capturing group around the URL
                (or with no group at all).
            savePath: String prefix for the output files; file number ``n`` is
                written to ``savePath + '<n>.gif'``.

        Returns:
            A function ``giveImg(html)`` that takes a page as UTF-8 encoded
            ``bytes`` and downloads each matched URL. The file counter lives in
            the closure, so numbering continues across calls instead of
            overwriting files saved from earlier pages.
        """
        import os  # imported locally so the function is self-contained

        # The pattern is the same for every page, so compile it once up front.
        imgre = re.compile(reg)
        targetDir = os.path.dirname(savePath)
        iCnt = 0

        def giveImg(html):
            nonlocal iCnt
            imglist = imgre.findall(html.decode('utf-8'))
            if imglist and targetDir:
                # urlretrieve opens the target file directly and raises
                # FileNotFoundError when its folder does not exist yet.
                os.makedirs(targetDir, exist_ok=True)
            for imgurl in imglist:
                urllib.request.urlretrieve(imgurl, savePath + '%s.gif' % iCnt)
                iCnt += 1

        return giveImg
    
    
    # html = getHtml("http://pic.sogou.com/")
    # reg = r'"image":"(.+?)"'  #sougou
    
    # Capture the URL stored in each data-original attribute. The dot before
    # "gif" is escaped so that only a literal ".gif" extension ends the match.
    reg = r'data-original="(.+?\.gif)"'
    savePath = 'image/gif/'
    g = getImg(reg,savePath)
    # Page 1 is the site root and is requested exactly once; pages 2-9 are
    # /2 ... /9. Counting from 0 would request the root twice and save every
    # front-page image a second time under new file names.
    for i in range(1, 10):
        if i > 1:
            print("http://www.budejie.com/" + str(i))
            html = getHtml("http://www.budejie.com/" + str(i))
        else:
            html = getHtml("http://www.budejie.com/")
        g(html)

    下载成功

  • 相关阅读:
    POJ-1189 钉子和小球(动态规划)
    POJ-1191-棋盘分割(动态规划)
    Java实现 LeetCode 730 统计不同回文子字符串(动态规划)
    Java实现 LeetCode 730 统计不同回文子字符串(动态规划)
    Java实现 LeetCode 729 我的日程安排表 I(二叉树)
    Java实现 LeetCode 729 我的日程安排表 I(二叉树)
    Java实现 LeetCode 729 我的日程安排表 I(二叉树)
    Java实现 LeetCode 728 自除数(暴力)
    Java实现 LeetCode 728 自除数(暴力)
    Java实现 LeetCode 728 自除数(暴力)
  • 原文地址:https://www.cnblogs.com/kusy/p/9357360.html
Copyright © 2011-2022 走看看