  • python-scrapy: distributed crawling

    fenbushi.py

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy_redis.spiders import RedisCrawlSpider
    from FenbushiProject.items import FenbushiprojectItem


    class FenbushiSpider(RedisCrawlSpider):
        name = 'fenbushi'
        # start_urls = ['https://www.1905.com/vod/list/n_1_t_1/o3.html?fr=vodhome_js_lx']
        redis_key = 'jianliQuene'  # name of the scheduler queue in Redis
        # follow the paginated listing pages (o3p2.html, o3p3.html, ...)
        link = LinkExtractor(allow=r'https://www.1905.com/vod/list/n_1_t_1/o3p\d+\.html')
        rules = (
            Rule(link, callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            divs = response.xpath('//*[@id="content"]/section[4]/div')
            for div in divs:
                # href = div.xpath('./a/@href')[0].extract()
                title = div.xpath('./a/@title')[0].extract()
                item = FenbushiprojectItem()
                # item["href"] = href
                item["title"] = title
                print(title)
                yield item
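    For reference, the allow pattern above is meant to match the site's paginated listing URLs. A quick sanity check of the pattern, assuming the pagination scheme o3p2.html, o3p3.html, ... (LinkExtractor's allow uses re.search semantics):

    import re

    pattern = r'https://www.1905.com/vod/list/n_1_t_1/o3p\d+\.html'
    print(bool(re.search(pattern, 'https://www.1905.com/vod/list/n_1_t_1/o3p2.html')))  # True
    print(bool(re.search(pattern, 'https://www.1905.com/vod/list/n_1_t_1/o3.html')))    # False (first page)
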

    items.py
    import scrapy


    class FenbushiprojectItem(scrapy.Item):
        title = scrapy.Field()

    settings.py
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    ROBOTSTXT_OBEY = False
    LOG_LEVEL = 'ERROR'
    # use the scrapy-redis dedup filter (fingerprints kept in Redis, shared by all nodes)
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    # use scrapy-redis's own scheduler (the request queue lives in Redis)
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # persist the queue and dedup set in Redis so the crawl can be paused and resumed
    SCHEDULER_PERSIST = True

    # item pipeline: push scraped items into Redis
    ITEM_PIPELINES = {
        'scrapy_redis.pipelines.RedisPipeline': 400,
    }

    # Redis server that the scheduler and pipeline connect to
    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379
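
    Note that RedisPipeline only serializes each item into a Redis list (by default named with the '%(spider)s:items' pattern, so 'fenbushi:items' here); persisting the data elsewhere takes a separate consumer. A minimal sketch using redis-py, assuming the default key name and a hypothetical items.jsonl output file:

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379)

    # drain items pushed by scrapy_redis.pipelines.RedisPipeline
    # (assumes the default item key pattern '%(spider)s:items')
    with open('items.jsonl', 'a', encoding='utf-8') as f:
        while True:
            _, data = r.blpop('fenbushi:items')  # blocks until an item arrives
            f.write(data.decode('utf-8') + '\n')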

    Running the project
    Configure Redis
    Open redis.windows.conf and edit two lines:
    Line 56: comment out the bind directive: #bind 127.0.0.1
    Line 75: protected-mode no

     Start Redis

     (screenshot of Redis starting successfully omitted)

     Run the crawler

    In the project terminal, run: scrapy crawl fenbushi (the spider starts and then idles, waiting for a start URL to appear in its Redis queue)

    In a cmd window, change into the Redis directory,

    then enter: redis-cli.exe lpush jianliQuene https://www.1905.com/vod/list/n_1_t_1/o3.html?fr=vodhome_js_lx
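
    The same seed URL can also be pushed from Python with redis-py (a sketch, assuming Redis is reachable on the default local port):

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379)
    # push the start URL into the queue named by redis_key in the spider
    r.lpush('jianliQuene', 'https://www.1905.com/vod/list/n_1_t_1/o3.html?fr=vodhome_js_lx')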

     Finally, you can download RedisDesktopManager to inspect the stored data.
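
     The items can also be checked straight from redis-cli (again assuming the default '%(spider)s:items' key pattern): redis-cli.exe lrange fenbushi:items 0 -1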

    
    
    


  • Original article: https://www.cnblogs.com/shiyi525/p/14285909.html