# Install the Redis server
sudo apt-get install redis-server

# Install the scrapy and scrapy-redis libraries
pip install scrapy
pip install scrapy-redis
# Terminal 1
$ redis-cli

# Terminal 2 (opened in the spider directory, with the virtual environment activated)
$ scrapy runspider bludv.py

# Terminal 1 (at the redis-cli prompt): push a start URL to kick off the crawl
lpush bludv:start_urls https://www.bludv.tv
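The start URL can also be pushed from Python instead of typing lpush at the redis-cli prompt. A minimal sketch using the redis-py client; the host, port, and key below simply mirror the examples above, so adjust them to your deployment:

import redis

# Connect to the same Redis instance the spider uses (assumed local defaults).
r = redis.Redis(host='localhost', port=6379, db=0)
# Pushing onto the spider's start_urls list starts the crawl.
r.lpush('bludv:start_urls', 'https://www.bludv.tv')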
# Terminal 1 (at the redis-cli prompt):
# Delete all keys in the current database
flushdb
# Delete the keys in all databases
flushall

# From the shell: the command below targets database 0, i.e. the default database
redis-cli -n 0 keys "*" | xargs redis-cli -n 0 del
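By default scrapy-redis keeps its state under per-spider keys ("<spider>:requests" for the scheduled queue, "<spider>:dupefilter" for seen-request fingerprints, and "<spider>:items" when RedisPipeline is enabled), so cleanup can be more targeted than flushdb. A small inspection sketch using redis-py:

import redis

r = redis.Redis(host='localhost', port=6379, db=0)
# List every key this spider has created, together with its Redis type.
for key in r.keys('bludv:*'):
    print(key, r.type(key))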
Changes needed in settings.py:
# Enable the scrapy-redis scheduler, which distributes requests through Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Enable request deduplication via Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Default scrapy-redis request queue (priority queue)
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# Schedule requests with a FIFO queue
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
# Optional: last-in, first-out ordering (LIFO)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'

# Do not clear the Redis queues, so crawls can be paused and resumed
# SCHEDULER_PERSIST = True

# REDIS_URL = 'redis://192.168.2.223:6379'  # Tencent Cloud server
# Specify the host and port to use when connecting to Redis (optional).
REDIS_HOST = 'localhost'
REDIS_PORT = 6379

# COOKIES_ENABLED = False   # Disable cookies
# DOWNLOAD_DELAY = 4        # Download delay
# DOWNLOAD_TIMEOUT = 10     # Request timeout

# Retry failed requests
RETRY_ENABLED = True
# Number of retries
RETRY_TIMES = 8

# Auto-throttle, to avoid missing pages
AUTOTHROTTLE_ENABLED = True

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'BludvSpider.pipelines.BludvspiderPipeline': 300,
    'BludvSpider.pipelines.MongodbWithPymongoPipeline': 301,  # Write to MongoDB via PyMongo
    'scrapy_redis.pipelines.RedisPipeline': 100,              # scrapy-redis distributed items
}
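The ITEM_PIPELINES above reference a MongodbWithPymongoPipeline whose implementation is not shown in this post. A minimal sketch of what such a pipeline might look like, assuming a local MongoDB instance and hypothetical database/collection names:

import pymongo

class MongodbWithPymongoPipeline:
    def open_spider(self, spider):
        # Assumed connection string and database name; adjust to your setup.
        self.client = pymongo.MongoClient('mongodb://localhost:27017')
        self.db = self.client['bludv']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Store each scraped item as a MongoDB document.
        self.db['items'].insert_one(dict(item))
        return item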
Changes needed in bludv.py:
# Before:
class BludvSpider(scrapy.Spider):

# After:
from scrapy_redis.spiders import RedisSpider

class BludvSpider(RedisSpider):
# Before:
# start_urls = [
#     # 'https://www.bludv.tv/category/series/'
#     # 'https://www.bludv.tv/category/series/page/56/'
#     # 'https://www.bludv.tv/o-protetor-2-torrent-2018-dublado-dual-audio-legendado-bluray-720p-e-1080p-download/'
#     "https://www.bludv.tv"
# ]

# After: read the start URLs from Redis instead
redis_key = "bludv:start_urls"
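Putting the two changes together, the converted spider looks roughly like this. The parse() body is only illustrative, since the real extraction logic of bludv.py is not shown here:

from scrapy_redis.spiders import RedisSpider

class BludvSpider(RedisSpider):
    name = 'bludv'
    # The spider idles until URLs are pushed onto this Redis list.
    redis_key = 'bludv:start_urls'

    def parse(self, response):
        # Hypothetical extraction; replace with the site's real selectors.
        for href in response.css('a::attr(href)').getall():
            yield {'url': response.urljoin(href)}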