  • Crawling cnblogs posts with scrapy + persistence + distributed crawling

    cnblogs_spider.py

    Plain Scrapy

    # -*- coding: utf-8 -*-
    import scrapy
    
    from ..items import TttItem
    
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'  # spider name
        start_urls = ['https://www.cnblogs.com']
    
        def parse(self, response):
            div_list = response.xpath('//div[@class="post_item_body"]')
            for div in div_list:
                title = div.xpath('./h3/a/text()').extract_first()
                url = div.xpath('./h3/a/@href').extract_first()
                outline = div.css('.post_item_summary::text').extract()[-1]
                author = div.xpath('./div[@class="post_item_foot"]/a/text()').extract_first()
    
    
                item = TttItem()
                item['title'] = title
                item['outline'] = outline
                item['author'] = author
                item['url'] = url
                yield scrapy.Request(url, callback=self.get_detail, meta={'item': item})
    
            beforeurl = response.url
            print(beforeurl)
    
            # grab the last <a> in the pager (the next-page link)
            next_url = response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first()
            print('next_url', next_url)
    
            if next_url:
                yield scrapy.Request(self.start_urls[0] + next_url, callback=self.parse)
    
        # fetch the full article body from the detail page
        def get_detail(self, response):
            content = response.xpath('//div[@id="cnblogs_post_body"]').extract_first()
            if not content:
                # fallback when the post body div is missing
                content = response.css('content').extract_first()
    
            item = response.meta.get('item')
            item['content'] = content
            yield item
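

    items.py

    The spider populates a TttItem imported from items.py, which the original post does not show (the distributed version below uses the same item). A minimal sketch, assuming the item simply declares one Field per key assigned above:

    import scrapy

    class TttItem(scrapy.Item):
        # one field per key used in the spiders
        title = scrapy.Field()
        outline = scrapy.Field()
        author = scrapy.Field()
        url = scrapy.Field()
        content = scrapy.Field()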
    
    

    pipelines.py

    import pymysql
    
    class CnblogsSaveMysqlPipline(object):
        def open_spider(self, spider):
            self.conn = pymysql.connect(user='root', password='123123', db='cnblogs')
    
        def close_spider(self, spider):
            self.conn.close()
    
        def process_item(self, item, spider):
            cursor = self.conn.cursor()
            sql = '''insert into cnb (title, outline, author, url, content) values (%s, %s, %s, %s, %s)'''
            cursor.execute(sql, args=(item['title'], item['outline'], item['author'], item['url'], item['content']))
            self.conn.commit()
            return item  # pass the item on so any later pipelines can still see it
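
    To actually run this pipeline it has to be enabled in the project settings, which the original post does not show. A sketch of the registration, assuming the Scrapy project package is named ttt (inferred from the TttItem import) and keeping the class name used above:

    # settings.py
    ITEM_PIPELINES = {
        'ttt.pipelines.CnblogsSaveMysqlPipline': 300,
    }

    The cnb table must already exist with the five columns named in the INSERT statement.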
    

    Distributed crawling

    The spider logic is unchanged; the only differences are that the class now inherits from scrapy_redis's RedisSpider and the hard-coded start_urls is replaced by a redis_key from which every worker pulls its start URLs.

    cnblogs_spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    
    from ..items import TttItem
    from scrapy.http import Request
    from scrapy_redis.spiders import RedisSpider
    
    class ChoutiSpider(RedisSpider):
        name = 'chouti'  # spider name
        allowed_domains = ['www.cnblogs.com']
        redis_key = 'myspider:start_urls'
    
    
        def parse(self, response):
            div_list = response.xpath('//div[@class="post_item_body"]')
            for div in div_list:
                title = div.xpath('./h3/a/text()').extract_first()
                url = div.xpath('./h3/a/@href').extract_first()
                outline = div.css('.post_item_summary::text').extract()[-1]
                author = div.xpath('./div[@class="post_item_foot"]/a/text()').extract_first()
    
                item = TttItem()
                item['title'] = title
                item['outline'] = outline
                item['author'] = author
                item['url'] = url
                yield Request(url, callback=self.get_detail, meta={'item': item})
    
            beforeurl = response.url
            print(beforeurl)
    
            # grab the last <a> in the pager (the next-page link)
            next_page = response.css('div.pager a:last-child::attr(href)').extract_first()
            print('---- next page url:', next_page)
            if next_page:
                yield Request(response.urljoin(next_page))
    
        # fetch the full article body from the detail page
        def get_detail(self, response):
            content = response.xpath('//div[@id="cnblogs_post_body"]').extract_first()
            if not content:
                # fallback when the post body div is missing
                content = response.css('content').extract_first()
    
            item = response.meta.get('item')
            item['content'] = content
            yield item
    

    settings.py

    # Store the scheduler's request queue in Redis.
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    
    # Ensure all spiders share the same duplicate filter through Redis.
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    
    
    REDIS_PARAMS = {'password':'redis123'}
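
    scrapy_redis also needs to know where Redis is, and a RedisSpider sits idle until a start URL is pushed onto its redis_key. A sketch of the remaining pieces (host and port values are placeholders for your own Redis instance):

    # where the shared request queue and dupefilter live
    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379

    # optional: keep the queue and dupefilter between runs instead of clearing them
    SCHEDULER_PERSIST = True

    Seed the crawl from any machine with redis-cli lpush myspider:start_urls https://www.cnblogs.com; every worker running scrapy crawl chouti then pulls requests from the shared queue, so the pages are split across machines automatically.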
    
  • Original post: https://www.cnblogs.com/kai-/p/12681708.html