zoukankan      html  css  js  c++  java
  • scrapy增量式爬虫

    命令:

    1.创建scrapy工程:scrapy startproject projectName
    2.创建爬虫文件:scrapy genspider -t crawl spiderName www.xxx.com
      指令多了 "-t crawl",表示创建的爬虫文件是基于CrawlSpider这个类的,而不再是Spider这个基类。
    3.运行爬虫:scrapy crawl spider2(spider2 即爬虫类中 name 属性的值)

    spider.py(方式一:对抓取到的内容生成哈希指纹进行去重)

    用 hashlib 对抓取到的内容生成哈希值,再把哈希值存入 Redis 的 set 中用于判断该内容是否已经爬取过;这样 Redis 中只需保存定长的哈希值而不是完整内容,可以减少用于去重校验的数据量。

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from redis import Redis
    from scrapy2.items import Scrapy2Item
    import hashlib
    
    
    class Spider2Spider(CrawlSpider):
        """Incremental crawler that de-duplicates posts by a content fingerprint.

        Pagination links are followed through ``rules``. For every post on a
        page, a SHA-256 digest of ``author + content`` is added to the Redis set
        ``qiubai_hash``; only posts whose digest was not already in the set are
        yielded as items.
        """
        name = 'spider2'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.qiushibaike.com/text/']

        rules = (
            # ``\d+`` matches the page number. Without the backslash the pattern
            # only matches literal "d" characters and no pagination link is
            # ever followed.
            Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            """Yield a ``Scrapy2Item`` for each post not recorded in Redis yet.

            :param response: the listing page matched by ``rules``.
            :return: generator of ``Scrapy2Item`` objects with ``content`` and
                ``author`` set.
            """
            div_list = response.xpath('//div[@class="article block untagged mb15 typs_hot"]')
            conn = Redis(host='127.0.0.1', port=6379)
            for div in div_list:
                item = Scrapy2Item()
                item['content'] = ''.join(
                    div.xpath('.//div[@class="content"]/span//text()').extract())
                item['author'] = div.xpath(
                    './div/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()

                # extract_first() returns None when neither author XPath matches;
                # fall back to '' so building the fingerprint does not raise
                # TypeError on ``None + str``.
                source = (item['author'] or '') + item['content']
                hashValue = hashlib.sha256(source.encode()).hexdigest()

                # SADD returns 1 only when the digest was newly added to the set.
                if conn.sadd('qiubai_hash', hashValue) == 1:
                    yield item
                else:
                    print('已爬取')

    spider.py(方式二:对详情页 URL 进行去重)

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from redis import Redis
    from scrapy2.items import Scrapy2Item
    
    
    class Spider2Spider(CrawlSpider):
        """Incremental crawler that de-duplicates detail pages by URL.

        Pagination links are followed through ``rules``. Every detail-page URL
        found on a listing page is added to the Redis set ``movies_url``; only
        URLs that were not already in the set are requested and parsed.
        """
        name = 'spider2'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/7.html']

        rules = (
            # ``\d+`` matches the page number and the dots are escaped so they
            # only match a literal ".". Without the backslash before ``d`` the
            # pattern matches literal "d" characters and no pagination link is
            # ever followed.
            Rule(LinkExtractor(allow=r'/index\.php/vod/show/id/7/page/\d+\.html'),
                 callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            """Schedule a request for each detail URL not recorded in Redis yet.

            :param response: the listing page matched by ``rules``.
            :return: generator of ``scrapy.Request`` objects whose callback is
                ``parse_detail``.
            """
            conn = Redis(host='127.0.0.1', port=6379)
            hrefs = response.xpath(
                '//li[@class="col-md-6 col-sm-4 col-xs-3"]/div/a/@href').extract()
            for href in hrefs:
                url = 'https://www.4567tv.tv' + href
                # SADD returns 1 when the URL was not in the set yet.
                # NOTE(review): the URL is recorded before the detail page has
                # been fetched, so a failed request is not retried on later runs.
                if conn.sadd('movies_url', url) == 1:
                    yield scrapy.Request(url=url, callback=self.parse_detail)
                else:
                    print('已爬取过')

        def parse_detail(self, response):
            """Extract ``name`` and ``actor`` from a detail page.

            :param response: the detail page requested by ``parse_item``.
            :return: generator yielding a single ``Scrapy2Item``.
            """
            item = Scrapy2Item()
            item['name'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
            item['actor'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()').extract_first()

            yield item

    settings.py

    # Project name and the packages in which spiders are looked up / generated.
    BOT_NAME = "scrapy2"
    SPIDER_MODULES = ["scrapy2.spiders"]
    NEWSPIDER_MODULE = "scrapy2.spiders"

    # User-Agent header value: a desktop Chrome string is used here instead of
    # a value that identifies the crawler itself.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/72.0.3626.119 Safari/537.36"
    )

    # robots.txt rules are not obeyed.
    ROBOTSTXT_OBEY = False

    # Maximum concurrent requests performed by Scrapy (default: 16).
    CONCURRENT_REQUESTS = 32

    # Enabled item pipeline and its order value.
    ITEM_PIPELINES = {
        "scrapy2.pipelines.Scrapy2Pipeline": 300,
    }

    pipelines.py

    from redis import Redis
    class Scrapy2Pipeline(object):
        """Item pipeline that stores each item in the Redis list ``qiubaiData``.

        Reads the ``name`` and ``actor`` fields of every item and pushes them to
        Redis as one JSON string per item.
        """

        def __init__(self):
            # Created in open_spider().
            self.conn = None

        def open_spider(self, spider):
            """Create the Redis client used by ``process_item``."""
            self.conn = Redis(host='127.0.0.1', port=6379)

        def process_item(self, item, spider):
            """Serialize ``item`` to JSON and push it onto ``qiubaiData``.

            :param item: mapping-like object providing ``name`` and ``actor``.
            :param spider: the spider that produced the item (unused).
            :return: ``item`` unchanged, so later pipelines can process it.
            :raises KeyError: if ``name`` or ``actor`` is missing from ``item``.
            """
            import json

            dic = {
                'name': item['name'],
                'actor': item['actor'],
            }
            # Redis values must be bytes, str or numbers: redis-py 3.x raises
            # DataError when handed a dict, and older versions stored its Python
            # repr. Store JSON instead; ensure_ascii=False keeps non-ASCII text
            # readable in Redis.
            self.conn.lpush('qiubaiData', json.dumps(dic, ensure_ascii=False))
            print('爬取到一条数据,正在入库......')
            return item
  • 相关阅读:
    Android--->activity高级运用,保存前一个界面为完成的数据savedInstanceState。
    Android--->activity界面跳转,以及查看生命周期过程
    Android--->Intent
    Android--->Button按钮操作
    安卓EditText按钮
    DDS视图&Button控件
    Android之EditText控件
    Android之TextView控件的学习
    usb免驱动摄像头实验
    Flash硬件原理
  • 原文地址:https://www.cnblogs.com/NachoLau/p/10480597.html
Copyright © 2011-2022 走看看