zoukankan      html  css  js  c++  java
  • python抓取不得姐动图(报错 urllib.error.HTTPError: HTTP Error 403: Forbidden)

    抓取不得姐动图(报错)

    # -*- coding:utf-8 -*-
    #__author__ :kusy
    #__content__:文件说明
    #__date__:2018/7/23 17:01
    import urllib.request
    import re
    
    def getHtml(url):
        """Fetch ``url`` and return the raw response body.

        The request is sent with urllib's default headers only, so a server
        that rejects non-browser clients may answer with HTTP 403.

        Args:
            url: Address to open; anything ``urllib.request.urlopen`` accepts.

        Returns:
            The undecoded response body as ``bytes``.

        Raises:
            urllib.error.HTTPError: If the server replies with an error status.
            urllib.error.URLError: If the request cannot be made at all.
        """
        page = urllib.request.urlopen(url)
        try:
            return page.read()
        finally:
            # Release the connection even when reading the body fails.
            page.close()
    
    def getImg(reg,savePath):
        """Build a downloader that saves every URL matched by ``reg``.

        Args:
            reg: Regular expression applied to the page text. It is expected to
                contain a single capturing group that yields the URL to fetch.
            savePath: Prefix prepended to each file name, e.g. ``'image/gif/'``.

        Returns:
            A function that takes a page as UTF-8 encoded ``bytes`` and downloads
            each match to ``savePath + '<n>.gif'``. The counter ``n`` starts at 0
            and keeps increasing across calls, so files from later pages do not
            overwrite files from earlier ones.
        """
        import os

        # Compile once here instead of on every page.
        imgre = re.compile(reg)
        iCnt = 0
        def giveImg(html):
            nonlocal iCnt
            imglist = imgre.findall(html.decode('utf-8'))
            if not imglist:
                return
            # urlretrieve does not create missing folders, so make sure the
            # directory part of the prefix exists before the first download.
            targetDir = os.path.dirname(savePath)
            if targetDir:
                os.makedirs(targetDir, exist_ok=True)
            for imgurl in imglist:
                urllib.request.urlretrieve(imgurl, savePath + '%s.gif' % iCnt)
                iCnt += 1
        return giveImg
    
    
    # html = getHtml("http://pic.sogou.com/")
    # reg = r'"image":"(.+?)"'  #sougou
    
    # Capture only the quoted attribute value: [^"] keeps the match inside a
    # single attribute, and \. requires a literal dot before the gif extension.
    reg = r'data-original="([^"]+?\.gif)"'
    savePath = 'image/gif/'
    g = getImg(reg,savePath)
    # The first page is the site root; pages 2..9 are addressed as /2 ... /9.
    # Starting at 1 (not 0) avoids fetching the root page twice, which would
    # download every image on it a second time.
    for i in range(1, 10):
        if i > 1:
            print("http://www.budejie.com/" + str(i))
            html = getHtml("http://www.budejie.com/" + str(i))
        else:
            html = getHtml("http://www.budejie.com/")
        g(html)

    报错如下

    E:\kusy\python\venv\Scripts\python.exe E:/kusy/python/getJpg.py
    http://www.budejie.com/2
    Traceback (most recent call last):
      File "E:/kusy/python/getJpg.py", line 35, in <module>
        html = getHtml("http://www.budejie.com/" + str(i))
      File "E:/kusy/python/getJpg.py", line 9, in getHtml
        page = urllib.request.urlopen(url)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen
        return opener.open(url, data, timeout)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open
        response = meth(req, response)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response
        'http', request, response, code, msg, hdrs)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 570, in error
        return self._call_chain(*args)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
        result = func(*args)
      File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 650, in http_error_default
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
    urllib.error.HTTPError: HTTP Error 403: Forbidden
    
    Process finished with exit code 1

     

    百度了下已解决:

    # -*- coding:utf-8 -*-
    #__author__ :kusy
    #__content__:文件说明
    #__date__:2018/7/23 17:01
    import urllib.request
    import re
    
    def getHtml(url):
        """Fetch ``url`` with a browser-like User-Agent and return the raw body.

        Args:
            url: Address to request.

        Returns:
            The undecoded response body as ``bytes``.

        Raises:
            urllib.error.HTTPError: If the server replies with an error status.
            urllib.error.URLError: If the request cannot be made at all.
        """
        # Without this header the request fails with
        # urllib.error.HTTPError: HTTP Error 403: Forbidden.
        # The site blocks crawlers, so the request poses as a browser through
        # the User-Agent header; the exact value can be looked up with
        # Firefox's FireBug plugin.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=url,headers=headers)
        # The context manager closes the response once the body has been read.
        with urllib.request.urlopen(req) as page:
            return page.read()
    
    def getImg(reg,savePath):
        """Build a downloader that saves every URL matched by ``reg``.

        Args:
            reg: Regular expression applied to the page text. It is expected to
                contain a single capturing group that yields the URL to fetch.
            savePath: Prefix prepended to each file name, e.g. ``'image/gif/'``.

        Returns:
            A function that takes a page as UTF-8 encoded ``bytes`` and downloads
            each match to ``savePath + '<n>.gif'``. The counter ``n`` starts at 0
            and keeps increasing across calls, so files from later pages do not
            overwrite files from earlier ones.
        """
        import os

        # Compile once here instead of on every page.
        imgre = re.compile(reg)
        iCnt = 0
        def giveImg(html):
            nonlocal iCnt
            imglist = imgre.findall(html.decode('utf-8'))
            if not imglist:
                return
            # urlretrieve does not create missing folders, so make sure the
            # directory part of the prefix exists before the first download.
            targetDir = os.path.dirname(savePath)
            if targetDir:
                os.makedirs(targetDir, exist_ok=True)
            for imgurl in imglist:
                urllib.request.urlretrieve(imgurl, savePath + '%s.gif' % iCnt)
                iCnt += 1
        return giveImg
    
    
    # html = getHtml("http://pic.sogou.com/")
    # reg = r'"image":"(.+?)"'  #sougou
    
    # Capture only the quoted attribute value: [^"] keeps the match inside a
    # single attribute, and \. requires a literal dot before the gif extension.
    reg = r'data-original="([^"]+?\.gif)"'
    savePath = 'image/gif/'
    g = getImg(reg,savePath)
    # The first page is the site root; pages 2..9 are addressed as /2 ... /9.
    # Starting at 1 (not 0) avoids fetching the root page twice, which would
    # download every image on it a second time.
    for i in range(1, 10):
        if i > 1:
            print("http://www.budejie.com/" + str(i))
            html = getHtml("http://www.budejie.com/" + str(i))
        else:
            html = getHtml("http://www.budejie.com/")
        g(html)

    下载成功

  • 相关阅读:
    C. Shaass and Lights 解析(思維、組合)
    D. Binary String To Subsequences(队列)(贪心)
    CodeForces 1384B2. Koa and the Beach (Hard Version)(贪心)
    CodeForces 1384B1. Koa and the Beach (Easy Version)(搜索)
    CodeForces 1384C. String Transformation 1(贪心)(并查集)
    CodeForces 1384A. Common Prefixes
    POJ-2516 Minimum Cost(最小费用最大流)
    POJ3261-Milk Patterns(后缀数组)
    HDU-1300 Pearls(斜率DP)
    HDU-4528 小明系列故事-捉迷藏(BFS)
  • 原文地址:https://www.cnblogs.com/kusy/p/9357360.html
Copyright © 2011-2022 走看看