  • scrapy-redis: making Redis store more than just URLs

    First, look at the relevant scrapy-redis source, the RedisMixin class and the spiders built on it in scrapy_redis/spiders.py:

# scrapy_redis/spiders.py (excerpt)
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider

from . import connection, defaults
from .utils import bytes_to_str


class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""
    redis_key = None
    redis_batch_size = None
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        """Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        """Schedules a request if available"""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: True)
        Use SET operations to retrieve messages from the redis queue.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        obj = super(RedisCrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj

    Reading through this carefully, you will notice that make_request_from_data(self, data) is the method that turns a message popped from Redis into a Request instance; by default the message is assumed to be a bare URL. The trick, then, is to override this method so that the decoded message, which can now be a JSON string, is passed straight into self.make_requests_from_url, and to override make_requests_from_url in turn to parse that string and build the request from the URL it contains (or from a URL generated out of the other fields). The code, wrapped in a minimal spider class for context (the class and spider names are placeholders), looks like this:
import json

from scrapy import Request
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str


class CompanySpider(RedisSpider):
    name = "myspider"  # placeholder name; redis_key then defaults to "myspider:start_urls"

    def make_request_from_data(self, data):
        """data: bytes, the raw message popped from Redis -- here a JSON string."""
        company = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(company)

    def make_requests_from_url(self, company):
        # json.loads rather than eval: eval would execute arbitrary code from
        # the queue and cannot parse JSON literals such as true/false/null.
        data = json.loads(company)
        url = data["url"]
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        # Carry the whole payload along in meta so the callback can use the extra fields.
        return Request(url, self.parse, meta={"data": data}, dont_filter=True, headers=headers)
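
    The extra fields in the message then travel into the callback through response.meta. A minimal sketch of a parse method on the same class reading them back (the company_name field is hypothetical, standing in for whatever your JSON messages carry):

    def parse(self, response):
        # The payload dict attached in make_requests_from_url above.
        data = response.meta["data"]
        yield {
            "url": response.url,
            "company_name": data.get("company_name"),  # hypothetical extra field
            "title": response.css("title::text").extract_first(),
        }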

    Worth noting:

    You cannot construct the Request directly inside make_request_from_data (other third-party request classes are not supported there either); the method will silently fail to execute, without raising any exception. Overriding both make_request_from_data and make_requests_from_url together, as above, does execute correctly.
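
    For completeness, here is a minimal sketch of the producer side, pushing a JSON message onto the spider's start-URLs queue with redis-py. It assumes a spider named myspider (so the default key, per REDIS_START_URLS_KEY, is myspider:start_urls) and the default list-backed queue (REDIS_START_URLS_AS_SET = False), whose LPOP on the consumer side pairs with LPUSH here:

import json

import redis

# Assumed connection parameters; adjust to your deployment.
r = redis.StrictRedis(host="localhost", port=6379, db=0)

message = {
    "url": "https://example.com/company/42",  # the page to crawl
    "company_name": "Example Co.",            # hypothetical extra field
}

r.lpush("myspider:start_urls", json.dumps(message))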
  • Original article: https://www.cnblogs.com/ltn26/p/10120444.html