zoukankan html css js c++ java

[scrapy-redis] 将scrapy爬虫改造成分布式爬虫 (2)

1. 修改redis设置

redis默认处在protected mode，需要修改/etc/redis.conf，设置 protected-mode no，或者给redis设置密码，

将bind 127.0.0.1这一行用#注释掉

2. 修改爬虫设置

向settings.py加入以下设置

REDIS_URL 为master的ip加上redis的端口号

# ---------------------------------------------------------------------------
# scrapy_redis integration
# ---------------------------------------------------------------------------

# Use the scrapy_redis scheduler so the request queue is stored in Redis.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'

# Route duplicate filtering through Redis so every spider shares one filter.
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# Keep the Redis queues when the crawl stops, so it can be paused and resumed.
SCHEDULER_PERSIST = True

# Queue implementation used for scheduling: a priority queue (the default).
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue"

# Push scraped items into Redis for post-processing.
ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 300,
}

# Optional: connect to Redis by host and port.
#REDIS_HOST = 'localhost'
#REDIS_PORT = 6379

# Optional: connect with a full Redis URL. When set, it takes precedence
# over the REDIS_HOST and REDIS_PORT settings.
#REDIS_URL = 'redis://user:pass@hostname:9001'
# Replace with the IP and Redis port of your own master node.
REDIS_URL = "redis://192.168.1.20:6379"

3. 修改爬虫代码

使爬虫继承自RedisSpider

from scrapy_redis.spiders import RedisSpider

class DoubanSpider(RedisSpider):

增加一个redis_key属性，这个属性就是start_urls在redis中的key
注释掉start_urls

#!/usr/bin/python3
# -*- coding: utf-8 -*-


import scrapy
from scrapy import Request
from project_douban.items import Movie

from scrapy_redis.spiders import RedisSpider

class DoubanSpider(RedisSpider):
    """Spider that yields one ``Movie`` item per ``div.item`` entry of a page.

    Start URLs are not hard-coded on the class; they are read from the Redis
    key named by ``redis_key``. After the items of a page have been yielded,
    the spider follows the "next" link when the page has one.
    """

    name = 'douban'

    allowed_domains = ['douban.com']

    # Redis key that holds the start URLs for this spider.
    redis_key = "doubanSpider:start_urls"

    # Disabled on purpose: start URLs come from Redis (see redis_key).
    #start_urls = ['https://movie.douban.com/top250']

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent' : 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Mobile Safari/537.36',
    }

    custom_settings = {
        'DEFAULT_REQUEST_HEADERS' : headers,
        # A real boolean instead of the string 'False'.
        'REDIRECT_ENABLED' : False,
        #'LOG_LEVEL' : 'WARNING',
    }

    def parse(self, response):
        """Extract the movies listed on ``response`` and follow pagination.

        Yields:
            A ``Movie`` for every ``div.item`` element, then a ``Request``
            for the next page if a "next" link is present.
        """
        for item in response.xpath('//div[@class="item"]'):
            movie = Movie()

            movie['index'] = item.xpath('div//em/text()').extract_first(default='')
            self.logger.info(movie['index'])

            movie['src'] = item.xpath('div//img/@src').extract_first(default='')
            self.logger.info(movie['src'])

            movie['title'] = item.xpath('.//div[@class="hd"]/a/span[1]/text()').extract_first(default='')
            self.logger.info(movie['title'])

            movie['star'] = item.xpath('.//span[@class="rating_num"]/text()').extract_first(default='')
            self.logger.info(movie['star'])

            raw_info = item.xpath('.//div[@class="bd"]/p').xpath('string(.)').extract_first(default='')
            # Drop plain spaces first, then turn non-breaking spaces and line
            # breaks into single spaces so the fields stay separated.
            movie['info'] = (
                raw_info.strip()
                .replace(' ', '')
                .replace('\xa0', ' ')
                .replace('\n', ' ')
            )
            self.logger.info(movie['info'])

            yield movie

        next_url = response.xpath('//span[@class="next"]/a/@href').extract_first(default='')
        self.logger.info('next_url: %s', next_url)
        if next_url:
            # The extracted href is appended to the list URL as-is.
            next_url = 'https://movie.douban.com/top250' + next_url
            yield Request(next_url, headers=self.headers)

log写入文件（optional）

import logging
import os
import time

def get_logger(name, start_time=time.strftime('%Y_%m_%d_%H', time.localtime()), log_dir='/var/log/scrapy-redis/'):
    """Return a logger that writes INFO, WARNING and ERROR records to separate files.

    Three files are used inside ``log_dir``: ``<start_time>_info.log``,
    ``<start_time>_warning.log`` and ``<start_time>_error.log``. Each file
    receives every record at or above its own level.

    Calling this function again with the same arguments returns the same
    logger without attaching a second set of handlers, so records are not
    written twice.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        start_time: Prefix for the log file names. The default is evaluated
            once, when the function is defined, so every logger created
            without an explicit value shares the same prefix.
        log_dir: Directory for the log files; created if it does not exist.

    Returns:
        The configured ``logging.Logger``.

    Raises:
        OSError: If the directory or the log files cannot be created.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, start_time)

    my_logger = logging.getLogger(name)
    my_logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '[%(asctime)s] [%(levelname)s] %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )

    # logging.getLogger returns the same object for the same name, so remember
    # which files are already attached and skip them.
    attached = {getattr(h, 'baseFilename', None) for h in my_logger.handlers}

    levels = (
        ('info', logging.INFO),
        ('warning', logging.WARNING),
        ('error', logging.ERROR),
    )
    for suffix, level in levels:
        filename = '%s_%s.log' % (log_path, suffix)
        # FileHandler stores its target as an absolute path in baseFilename.
        if os.path.abspath(filename) in attached:
            continue
        handler = logging.FileHandler(filename, 'a', encoding='UTF-8')
        handler.setLevel(level)
        handler.setFormatter(formatter)
        my_logger.addHandler(handler)

    return my_logger

Miscellaneous

RedisSpider vs RedisCrawlSpider

直接看源代码，上文本比较

item	RedisSpider	RedisCrawlSpider
REDIS_START_URLS_AS_SET	default: False	default: True
	继承自Spider	继承自CrawlSpider

scrapy.Spider -> scrapy.CrawlSpider

scrapy.Spider是所有爬虫的基类, scrapy.CrawlSpider基于scrapy.Spider, 增加了rules, 可以设置某种规则，只爬取满足这些规则的网页, RedisCrawlSpider也继承了这一特性

Reference

查看全文

相关阅读:
Why does the memory usage increase when I redeploy a web application?
lsof
Advising controllers with the @ControllerAdvice annotation
springMVC(一)：整体请求过程概述
 正则表达式30分钟入门教程
 Python基本语法_强制数据类型转换
 Python 正则表达式入门（初级篇）
python实现简单爬虫功能
 在python3.3后urllib2已经不能再用，只能用urllib.request来代替
 JSON

原文地址：https://www.cnblogs.com/arcsinw/p/9118710.html