zoukankan      html  css  js  c++  java
  • python-scrapy-增量式

    movie.py

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from zlsPro.items import ZlsproItem
    from redis import Redis


    class MovieSpider(CrawlSpider):
        """Incremental movie crawler for 4567kan.com.

        Follows the paginated listing pages, and for every movie whose detail
        URL has not been recorded in the Redis set ``movie_url`` yet, requests
        the detail page and yields a ``ZlsproItem`` with ``title``, ``href``
        and ``detail``. URLs already present in the set are skipped, so
        repeated runs only emit newly published movies.
        """

        name = 'movie'
        start_urls = ['https://www.4567kan.com/index.php/vod/show/id/1.html']

        rules = (
            # ``\d+`` matches the page number; the dots are escaped so they
            # only match a literal ".". Without the backslash the pattern
            # would look for the letter "d" and never match a listing page.
            Rule(
                LinkExtractor(allow=r'/index\.php/vod/show/id/1/page/\d+\.html'),
                callback='parse_item',
                follow=True,
            ),
        )

        # Redis connection shared by all callbacks; holds the set of detail
        # URLs that have already been scheduled.
        coon = Redis(host='127.0.0.1', port=6379)

        def parse_item(self, response):
            """Parse one listing page and schedule unseen detail pages.

            Args:
                response: the listing-page response.

            Yields:
                scrapy.Request: one request per movie whose detail URL was
                newly added to the Redis set ``movie_url``; the partially
                filled item travels in ``meta['item']``.
            """
            li_list = response.xpath('//div[1]/div/div/div/div[2]/ul/li')
            for li in li_list:
                title = li.xpath('./div/div/h4/a/text()').extract_first()
                path = li.xpath('./div/div/h4/a/@href').extract_first()
                if title is None or path is None:
                    # Entry without a title link (ad, placeholder, changed
                    # markup): skip it instead of raising IndexError and
                    # losing the rest of the page.
                    continue

                # Keep the fixed host prefix so the strings stored in Redis
                # stay identical between runs.
                href = 'https://www.4567kan.com' + path

                # SADD returns 1 only when the member was not in the set yet.
                # NOTE(review): the URL is recorded before the detail page is
                # fetched, so a failed detail request is not retried on later
                # runs.
                ex = self.coon.sadd('movie_url', href)
                if ex == 1:
                    print('有新增')
                    item = ZlsproItem()
                    item['title'] = title
                    item['href'] = href
                    yield scrapy.Request(
                        url=href,
                        callback=self.parse_href,
                        meta={'item': item},
                    )
                else:
                    print('暂无新增')

        def parse_href(self, response):
            """Complete the item with the text taken from the detail page.

            Args:
                response: the detail-page response; ``response.meta['item']``
                    is the item started in ``parse_item``.

            Yields:
                ZlsproItem: the item with ``detail`` set (``None`` when the
                XPath matches nothing).
            """
            detail = response.xpath(
                '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()'
            ).extract_first()
            item = response.meta['item']
            item['detail'] = detail
            yield item

    settings.py

    # Browser-like user agent sent with requests.
    USER_AGENT = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.106 Safari/537.36'
    )
    # Do not apply robots.txt rules.
    ROBOTSTXT_OBEY = False
    # Only show log records of level ERROR and above.
    LOG_LEVEL = 'ERROR'

    # Use the duplicate filter provided by the scrapy-redis component.
    DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
    # Use the scheduler provided by the scrapy-redis component.
    SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
    # Whether pausing is allowed (scheduler state is kept between runs).
    SCHEDULER_PERSIST = True

    # Item pipeline: hand scraped items to the scrapy-redis pipeline.
    ITEM_PIPELINES = {
        'scrapy_redis.pipelines.RedisPipeline': 400,
    }

    # Redis server used by the scrapy-redis components.
    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379

    items.py

    import scrapy

    class ZlsproItem(scrapy.Item):
        """One movie record: listing title, detail-page URL and detail text."""

        # Title text read from the listing entry.
        title = scrapy.Field()
        # Absolute URL of the movie's detail page.
        href = scrapy.Field()
        # Text extracted from the detail page.
        detail = scrapy.Field()

    运行项目:scrapy crawl movie
  • 相关阅读:
    智联招聘
    我的Linux以及软件配置(长期更新)
    关于Git的笔记
    PHP和HTML表单
    web学习笔记——CSS整理(一)
    新开通博客园
    Thinphp模板替换
    __APP__
    大步前行
    centos 7 添加环境变量
  • 原文地址:https://www.cnblogs.com/shiyi525/p/14286167.html
Copyright © 2011-2022 走看看