  • scrapy-redis source code analysis

    #scrapy-redis ---> queue.py --> class FifoQueue (a queue); LifoQueue (a last-in-first-out stack)
    
    #self.server is the redis connection held by the parent class Base ---> LifoQueue._encode_request ---> Base._encode_request --> serializer ---> picklecompat
    
    #picklecompat-->/usr/local/lib/python3.6/site-packages/scrapy_redis/picklecompat.py
    # def loads(s):
    #     return pickle.loads(s)
    #
    #
    # def dumps(obj):
    #     return pickle.dumps(obj, protocol=-1)
    
    #The serializer is implemented with pickle
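    # A minimal round-trip with that pickle-based serializer (a hedged sketch; the dict
    # below merely stands in for the request dict produced by Base._encode_request):
    from scrapy_redis import picklecompat
    obj = {'url': 'http://www.baidu.com', 'method': 'GET'}
    data = picklecompat.dumps(obj)          # bytes written to redis
    assert picklecompat.loads(data) == obj  # and read back unchanged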
    
    
    
    #Where server comes from
    #/usr/local/lib/python3.6/site-packages/scrapy_redis/connection.py
    
    # via def get_redis ---> defaults.REDIS_CLS --> /usr/local/lib/python3.6/site-packages/scrapy_redis/defaults.py
    #redis.StrictRedis --> # import redis; redis.Redis ---> /usr/local/lib/python3.6/site-packages/redis/client.py ---> Redis inherits from StrictRedis
    
    
    #def get_redis
    #     if url:
    #         return redis_cls.from_url(url, **kwargs)
    #     else:
    #         return redis_cls(**kwargs)
    # With a url, instantiate via from_url; without one, call the class directly --> either way we get server
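    # The same pattern sketched in isolation (an illustrative helper, not the scrapy_redis code
    # itself; assumes the redis package is installed):
    import redis

    def make_server(url=None, **kwargs):
        redis_cls = kwargs.pop('redis_cls', redis.StrictRedis)
        if url:
            return redis_cls.from_url(url, **kwargs)   # e.g. 'redis://user:pass@localhost:6379/0'
        return redis_cls(**kwargs)                     # e.g. host='localhost', port=6379

    # server = make_server(url='redis://localhost:6379/0')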
    
    
    #The scheduler
    #/usr/local/lib/python3.6/site-packages/scrapy_redis/scheduler.py
    #def enqueue_request ---> self.queue.push(request)  puts requests in
    #def next_request ---> request = self.queue.pop(block_pop_timeout)  takes requests out
    #def enqueue_request ---> self.df.request_seen(request)  checks whether the request was already seen
    #/usr/local/lib/python3.6/site-packages/scrapy_redis/dupefilter.py  def request_seen
    #self.server.sadd adds to a set; its return value decides: 0 means nothing was added because the fingerprint is already there (seen before), 1 means it had not been seen and has just been added
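    # A quick check of the sadd return value that request_seen relies on (hedged sketch;
    # assumes a local redis server, and 'demo:dupefilter' is just an illustrative key):
    import redis
    server = redis.StrictRedis(host='localhost', port=6379)
    print(server.sadd('demo:dupefilter', 'fingerprint-1'))  # 1: not seen before, added now
    print(server.sadd('demo:dupefilter', 'fingerprint-1'))  # 0: already in the set, i.e. seen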
    
    #In here:
    #/usr/local/lib/python3.6/site-packages/scrapy_redis/queue.py  LifoQueue pop/push
    
    #
    
    #dupefilter.py, pipelines.py, scheduler.py are where the component work happens
    #defaults.py holds the default values
    #picklecompat.py decides what to serialize with
    #queue.py is what scheduler.py calls into
    
    #spiders.py, the spider side: def start_requests --> self.next_requests
    # use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
    # fetch_one = self.server.spop if use_set else self.server.lpop
    #i.e. the start URLs are fetched from redis, so when using these spiders the start URLs must be stored in redis in advance; without this spider we add the start URLs by hand ourselves, as sketched below
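    # Seeding the start URLs by hand can look like this (hedged sketch; assumes a local redis,
    # and the key follows REDIS_START_URLS_KEY = '%(name)s:start_urls' for a spider named 'renjian'):
    import redis
    conn = redis.StrictRedis(host='localhost', port=6379)
    conn.lpush('renjian:start_urls', 'http://www.chouti.com')    # list form (the default)
    # conn.sadd('renjian:start_urls', 'http://www.chouti.com')   # set form, if REDIS_START_URLS_AS_SET = True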
    #
    #utils.py holds the shared helpers
    
    
    #Connecting to redis
    #If a url is given, the url takes precedence; otherwise the host/ip settings are used
    #Configure these in settings
    #http://www.cnblogs.com/wupeiqi/articles/6912807.html
    
    # REDIS_HOST = 'localhost'                            # host name
    # REDIS_PORT = 6379                                   # port
    # REDIS_URL = 'redis://user:pass@hostname:9001'       # connection URL (takes precedence over the settings above)
    # REDIS_PARAMS  = {}                                  # Redis connection parameters; default: REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
    # REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # Python class used to connect to Redis; default: redis.StrictRedis
    # REDIS_ENCODING = "utf-8"                            # redis encoding; default: 'utf-8'
    
    
    
    #Scheduler configuration
    SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # PriorityQueue (sorted set) is the default; the others are FifoQueue (list) and LifoQueue (list)
    SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # the redis key under which the scheduler stores requests
    # v='%(spider)s:requests'
    # val=v%{'spider':'hahah'}
    # val
    # 'hahah:requests'
    #Every spider has its own queue in scrapy-redis, i.e. its own key in redis
    #renjian:requests: {} or []; whether it is a {} or a [] follows from the queue class: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
    
    #These keys hold the tasks
    #renjian:requests:['http://www.baidu.com',]
    #jianren:requests:['http://www.chouti.com',]
    #name = 'renjian' in /Users/shuanggai/PycharmProjects/git/python/D20171113/scrapy-redis/day115/dabo/dabo/spiders/renjian.py is what gets interpolated into the key
    
    # SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for the data saved to redis; pickle by default
    # SCHEDULER_PERSIST = True  # whether to keep the scheduler queue and the dedup records on close; True = keep, False = clear
    # Clear them while testing; keep them in production
    # SCHEDULER_FLUSH_ON_START = True  # whether to clear the scheduler queue and the dedup records on start; True = clear, False = keep
    # SCHEDULER_IDLE_BEFORE_CLOSE = 10  # maximum time to wait when fetching from the scheduler and it is empty (if there is still no data at the end, nothing is returned)
    #/usr/local/lib/python3.6/site-packages/scrapy_redis/scheduler.py --> def next_request --> self.idle_before_close
    #idle_before_close=0 by default
    #/usr/local/lib/python3.6/site-packages/scrapy_redis/queue.py --> def pop --> data = self.server.blpop(self.key, timeout)
    #blpop returns a (key, value) tuple; this is also where the timeout takes effect
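    # The blpop behaviour the timeout hinges on (hedged sketch; assumes a local redis and an
    # illustrative key): it blocks up to `timeout` seconds and returns a (key, value) tuple, or None.
    import redis
    server = redis.StrictRedis(host='localhost', port=6379)
    data = server.blpop('demo:requests', timeout=10)
    if isinstance(data, tuple):
        data = data[1]   # keep only the encoded request, just as the queue's pop() does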
    
    # SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # the redis key under which the dedup records are stored
    # SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # the class that implements the dedup rule
    #The visit records; their redis type is a set
    #/usr/local/lib/python3.6/site-packages/scrapy_redis/dupefilter.py
    #def request_seen ---》self.server.sadd
    """
    renjian:dupefilter:{}
    jianren:dupefilter:{}
    """
    #/usr/local/lib/python3.6/site-packages/scrapy_redis/dupefilter.py --》def request_seen---》fp = self.request_fingerprint(request)
    #/usr/local/lib/python3.6/site-packages/scrapy/utils/request.py--》def request_fingerprint(request, include_headers=None):
    from scrapy.utils.request import request_fingerprint
    from scrapy.http import Request
    r1=Request(url='http://www.baidu.com?id=1&page=2',headers={'k1':'v1'})
    r1_str = request_fingerprint(r1,include_headers=['k1']) #turn the request object into a fingerprint string
    print(r1_str) #75d6587d87b3f4f3aa574b33dbd69ceeb9eafe7b
    r2=Request(url='http://www.baidu.com?page=2&id=1',headers={'k1':'v2'})
    r2_str = request_fingerprint(r2,include_headers=['k1'])
    print(r2_str)
    #By default, whether headers are present and the order of the query parameters do not change the value, but adding or removing parameters does; headers are only taken into account when include_headers is passed
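    # Putting the fingerprint and the set together, request_seen amounts to roughly this
    # (hedged sketch, not the real dupefilter code; it reuses the imports above, and the
    # key name and redis host are illustrative):
    import redis
    server = redis.StrictRedis(host='localhost', port=6379)

    def seen(request, key='renjian:dupefilter'):
        fp = request_fingerprint(request)
        added = server.sadd(key, fp)   # 1 -> new fingerprint, 0 -> already in the set
        return added == 0

    print(seen(Request(url='http://www.baidu.com?id=1&page=2')))  # False the first time, True afterwards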
    
    #Important
    ## Use the scrapy_redis scheduler
    SCHEDULER="scrapy_redis.scheduler.Scheduler"
    #Use scrapy_redis for the dedup records
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    
    #Data persistence
    # from scrapy_redis.pipelines import RedisPipeline
    # ITEM_PIPELINES = {
    #    'dabo.pipelines.DaboPipeline': 300,
    #    'dabo.pipelines.XiaoboPipeline': 400,
    # }
    from scrapy_redis.pipelines import RedisPipeline
    ITEM_PIPELINES = {
       'scrapy_redis.pipelines.RedisPipeline': 300,
    }
    REDIS_ITEMS_KEY = '%(spider)s:items'
    REDIS_ITEMS_SERIALIZER = 'json.dumps'
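    # With the settings above, RedisPipeline pushes each serialized item onto the '%(spider)s:items'
    # list in redis. A hedged sketch of reading them back for inspection (assumes a local redis,
    # a spider named 'renjian', and the json.dumps serializer configured above):
    import json
    import redis
    conn = redis.StrictRedis(host='localhost', port=6379)
    items = [json.loads(x) for x in conn.lrange('renjian:items', 0, -1)]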
    
    #Start URLs
    REDIS_START_URLS_AS_SET = False
    #True means a set, False means a list
    REDIS_START_URLS_KEY = '%(name)s:start_urls'
    
    
    #/usr/local/lib/python3.6/site-packages/scrapy_redis/spiders.py fetches the start_urls, so the urls need to be added to redis first
    #def start_requests-->self.next_requests
    # use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
    # fetch_one = self.server.spop if use_set else self.server.lpop
    #conn.lpush('renjian:start_urls','http://www.chouti.com')
    
    #/usr/local/lib/python3.6/site-packages/scrapy_redis/pipelines.py--->class RedisPipeline-->def process_item-->deferToThread(self._process_item, item, spider)
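    # The deferToThread pattern used there, in isolation (hedged sketch; the toy function stands in
    # for RedisPipeline._process_item, and the Deferred only fires under a running Twisted reactor):
    from twisted.internet.threads import deferToThread

    def _process_item(item, spider):
        # blocking work (the real pipeline pushes the serialized item into redis here)
        return item

    # d = deferToThread(_process_item, item, spider)  # returns a Deferred without blocking the reactor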
    
    
    # class Base(object):
    #     """Per-spider base queue class"""
    #
    #     def __init__(self, server, spider, key, serializer=None):
    #         """Initialize per-spider redis queue.
    #
    #         Parameters
    #         ----------
    #         server : StrictRedis
    #             Redis client instance.
    #         spider : Spider
    #             Scrapy spider instance.
    #         key: str
    #             Redis key where to put and get messages.
    #         serializer : object
    #             Serializer object with ``loads`` and ``dumps`` methods.
    #
    #         """
    #         if serializer is None:
    #             # Backward compatibility.
    #             # TODO: deprecate pickle.
    #             serializer = picklecompat
    #         if not hasattr(serializer, 'loads'):
    #             raise TypeError("serializer does not implement 'loads' function: %r"
    #                             % serializer)
    #         if not hasattr(serializer, 'dumps'):
    #             raise TypeError("serializer '%s' does not implement 'dumps' function: %r"
    #                             % serializer)
    #
    #         self.server = server
    #         self.spider = spider
    #         self.key = key % {'spider': spider.name}
    #         self.serializer = serializer
    #
    #     def _encode_request(self, request):
    #         """Encode a request object"""
    #         obj = request_to_dict(request, self.spider)
    #         return self.serializer.dumps(obj)
    
    
    
    # class LifoQueue(Base):
    #     """Per-spider LIFO queue."""
    #
    #     def __len__(self):
    #         """Return the length of the stack"""
    #         return self.server.llen(self.key)
    #
    #     def push(self, request):
    #         """Push a request"""
    #         self.server.lpush(self.key, self._encode_request(request))
    #
    #     def pop(self, timeout=0):
    #         """Pop a request"""
    #         if timeout > 0:
    #             data = self.server.blpop(self.key, timeout)
    #             if isinstance(data, tuple):
    #                 data = data[1]
    #         else:
    #             data = self.server.lpop(self.key)
    #
    #         if data:
    #             return self._decode_request(data)
    #
    #
    #
    # def get_redis(**kwargs):
    #     """Returns a redis client instance.
    #
    #     Parameters
    #     ----------
    #     redis_cls : class, optional
    #         Defaults to ``redis.StrictRedis``.
    #     url : str, optional
    #         If given, ``redis_cls.from_url`` is used to instantiate the class.
    #     **kwargs
    #         Extra parameters to be passed to the ``redis_cls`` class.
    #
    #     Returns
    #     -------
    #     server
    #         Redis client instance.
    #
    #     """
    #     redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS)
    #     url = kwargs.pop('url', None)
    #     if url:
    #         return redis_cls.from_url(url, **kwargs)
    #     else:
    #         return redis_cls(**kwargs)
    

      

  • Original post: https://www.cnblogs.com/morgana/p/7840794.html