zoukankan      html  css  js  c++  java
  • requests获取所有状态码

    requests获取所有状态码

    requests默认是不会获取301/302的状态码的。可以设置allow_redirects=False,这样就可以获取所有的状态码了

    import requests
    
    # Demo: fetch a URL *without* following redirects, so 301/302 status
    # codes stay visible (requests follows redirects by default).
    # Other URLs tried while experimenting:
    #   http://www.freebuf.com/news/157100.html                -> 200
    #   http://www.freebuf.com/articles/database/151839.html   -> 403 (exists, access denied)
    #   http://www.freebuf.com/articles/database/1518391.html  -> 404 (page missing)
    #   a non-existent domain, or a blocked site, raises an exception
    #   a bare IP works too, but the http:// scheme is required
    url = 'http://www.freebuf.com/fevents/133225.html'  # answers 302 when redirects are disabled
    # Browser-like User-Agent so the site serves a normal response.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    try:
        # allow_redirects=False keeps the raw status code instead of the
        # final 200 of the redirect target.
        response = requests.get(url, headers=headers, allow_redirects=False)
        # Compare what was asked for with what actually answered.
        print('    give url:', url)
        print(' request.url:', response.request.url)
        print('response.url:', response.url)
        print(response.content)
        print(response.status_code)
    except Exception as e:
        # Connection-level failures (DNS errors, blocked hosts, ...) land here.
        print(e)
    
    

      

    封装一个获取所有状态码的函数,同时实现验证返回值的方法

    import requests
    
    
    def get_statecode_or_errinfo(url=''):
        '''
        Return the HTTP status code of *url*, or an error message when no
        response was received.

        Redirects are NOT followed (allow_redirects=False), so 301/302
        codes are reported as-is instead of the redirect target's 200.

        :param url: the URL to request
        :return: int status code on response, str error message otherwise
        '''
        # `not url` also rejects None, not just '' as before.
        if not url:
            # Historical message kept verbatim — callers may match on it.
            return '请输入一个url作为get_statecode_or_errinfo的参数'
        # Browser-like User-Agent so the site serves a normal response.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
        try:
            # Disable redirects so the raw status code is returned.
            response = requests.get(url, headers=headers, allow_redirects=False)
            return response.status_code
        except Exception as e:
            # BUG FIX: the original returned the Exception object itself,
            # although the documented contract is "error info".  Returning
            # its text keeps the return type a plain int-or-str.
            return str(e)
    
    
    
    if __name__ == '__main__':
        # Ways to validate a response beyond the status code: compare the
        # requested URL with the final URL (detects redirects), inspect the
        # request/response headers, and check the body length.
        #
        # URLs used while experimenting (kept for reference):
        #   http://www.freebuf.com/news/157100.html                -> 200
        #   http://www.freebuf.com/fevents/133225.html             -> 302 with allow_redirects=False
        #   http://www.freebuf.com/articles/database/151839.html   -> 403
        #   http://www.freebuf.com/articles/database/1518391.html  -> 404
        #   http://10.1.75.241                                     -> 200 (http:// scheme required)
        #   a non-existent domain crashes, unless an Nginx answers for it
        #   https://www.douban.com/...  -> blocked here; behaves like a network outage
        url = 'http://www.freebuf.com/news/171238.html'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
        # Redirects are followed here; pass allow_redirects=False to see 3xx codes.
        response = requests.get(url, headers=headers)
    
        # Status code.
        print(response.status_code)
    
        # Requested URL vs. the URL that actually answered (differs after a redirect).
        print(url)
        print(response.url)
    
        # Headers that were sent, and headers that came back.
        print(response.request.headers)
        print(response.headers)
    
        # Detected body encoding (set response.encoding = 'utf-8' before
        # reading response.text if the guess is wrong).
        print(response.encoding)
    
        # Body length — a cheap sanity check on the payload.
        print(len(response.content))
    
    

      

    说明:

    反爬:
    总结多种用requests验证返回值的方式。
    比如:检查状态码、检查url(有可能发送了跳转)、检查请求头、检查响应头、检查源码、检查源码字符串长度。
    检查状态码
    print (response.status_code)
    检查url
    print (response.url)
    检查请求头
    print (response.request.headers)
    检查响应头
    print (response.headers)
    检查源码字符串长度
    print (len(response.content))
    检查源码
    print (response.content)
    print (response.content.decode())
    response.encoding='utf-8'
    print (response.text)
    print (response.encoding)

    scrapy爬虫的响应规则:

    # 1、被过滤掉,不发出请求:不在允许的域名范围内
    # temp['title_url'] = "https://www.baidu.com/"  # 跨域。请求发出前,url直接被过滤掉。
    # temp['title_url'] = "http://open.freebuf.com/live?id=1021"  # 跨域。请求发出前,url直接被过滤掉。
    # temp['title_url'] = "http://10.1.75.241"  # 请求ip地址,请求发出前,url直接被过滤掉。如果设置为允许ip网站,没有被过滤,就返回200
    
    # 2、禁止访问
    # temp['title_url'] = "http://www.freebuf.com/articles/database/151839.html"#禁止访问403,资源存在,不让访问。Ignoring non-200 response
    # temp['title_url'] = "http://www.freebuf.com/articles/database/1518391.html"#禁止访问404,资源本身不存在。Ignoring non-200 response
    
    # 3、重定向后的作为新请求
    # temp['title_url'] = "http://www.freebuf.com/news/156654.html"  # 重定向301、302。会返回重定向后200的状态码
    
    # 4、断网
    # temp['title_url'] = "https://www.douban.com/group/topic/49606658/"  # 公司限制访问。[<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
    
    # 5、没有的网站
    # temp['title_url'] = "https://www.badfsdsdfsdfsdfsdddd.com/"  # 直接被过滤掉,如果没有被过滤,就返回域名解析错误:DNS lookup failed: no results for hostname lookup: www.badfsdsdfsdfsdfsdddd.com.
    pass
    

      

     scrapy爬虫举例

    freebuf2.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy_FB.items import ScrapyFb2Item
    
    
    # from util.logger import Logger
    
    # logger_freebuf2 = Logger(logname=__name__, logpath='collection_log', logformat=1, loglevel=10).getlog()
    # logger_freebuf2.debug('i am debug3')
    # logger_freebuf2.info('i am info3')
    # logger_freebuf2.warning('i am warning3')
    
    
    class Freebuf2Spider(scrapy.Spider):
        """Spider for freebuf.com article list pages.

        Walks the paginated list starting at page 708, follows each article
        link, and yields one ScrapyFb2Item per article.  Off-domain links are
        filtered out by allowed_domains before a request is ever sent.
        """
        name = 'freebuf2'
        allowed_domains = ['freebuf.com', 'douban.com']
        start_urls = ['http://www.freebuf.com/page/708']
    
        def parse(self, response):
            """Parse one list page: yield a request per article, then the next page."""
            cur_url = response.url  # current list-page url
            # The page number is the last path segment of the list URL.
            cur_page_num = int(cur_url.rpartition('/')[-1])
    
            print('cur_url:%s' % cur_url)
            print('cur_page_num:%s' % cur_page_num)
    
            # Article title links on this list page.
            node_list = response.xpath('//*[@id="timeline"]/div/div[2]/dl/dt/a[1]')
            print('len(node_list):%s' % len(node_list))
    
            count_node = len(node_list)  # number of detail entries on this page
    
            for i, node in enumerate(node_list):
                temp = ScrapyFb2Item()
                temp['title'] = node.xpath('./text()').extract()[0].strip()
                # BUG FIX: the original `if i == 0:` branch contained only
                # commented-out experiment URLs and `pass`, so the first node
                # of every page had no 'title_url' and the yield below raised
                # KeyError.  Every node now gets its real href.
                temp['title_url'] = node.xpath('./@href').extract()[0]
                temp['page_num'] = str(cur_page_num)
                temp['line_num'] = i + 1
                temp['line_total'] = str(count_node)
                # errback catches failures that never produce a response
                # (DNS errors, lost connections, ...).
                yield scrapy.Request(temp['title_url'], callback=self.parse_detail,
                                     meta={"meta_1": temp}, errback=self.err)
    
            if len(node_list) != 0:  # stop paginating once a page has no entries
                next_url = 'http://www.freebuf.com/page/{}'.format(cur_page_num + 1)
                yield scrapy.Request(next_url, callback=self.parse)  # follow next page
    
        def parse_detail(self, response):
            """Log which detail page answered for which list entry."""
            item = response.meta['meta_1']
    
            print(item['line_num'], item['title_url'])
            print(item['line_num'], response.request.url)
    
        def err(self, failure):
            """Errback: report requests that failed before any response arrived.

            Receives a twisted Failure (the original parameter was misleadingly
            named `response`; scrapy passes errbacks a Failure positionally).
            """
            print('err:', failure.request.url)
            print('err:', failure.getErrorMessage())
            print(dir(failure))
    
    

      

  • 相关阅读:
    Android四大基本组件介绍与生命周期
    TRIZ系列-创新原理-23-反馈原理
    hibernate之6.one2many单向
    软件评測师真题考试分析-5
    WAS集群系列(3):集群搭建:步骤1:准备文件
    Android Developer:合并清单文件
    移动均值滤波与中值滤波
    使用React的static方法实现同构以及同构的常见问题
    mysql合并同一列的值
    iOS开发
  • 原文地址:https://www.cnblogs.com/andy9468/p/8401069.html
Copyright © 2011-2022 走看看