  • Jianshu full-site crawl with CrawlSpider, saving to MySQL asynchronously

    # Jianshu (jianshu.com)
    # Save the data in MySQL; integrate selenium+chromedriver into Scrapy; crawl the whole site
    # Scrape ajax-loaded data
    
    # Spider file
    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from jianshu_spider.items import ArticleItem
    
    class JsSpider(CrawlSpider):
        name = 'js'
        allowed_domains = ['jianshu.com']
        start_urls = ['https://www.jianshu.com/'] # start crawling from the homepage
    
        rules = (
            # On a detail page, the recommended articles below the content have hrefs of the form /p/.......
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
                 callback='parse_detail', follow=True),
        )
    
        def parse_detail(self, response):
            # print(response.text)
            title = response.xpath("//div[@class='note']/div[@class='post']/div[@class='article']/h1[@class='title']/text()").get()
            # print(title)
            avatar = response.xpath("//a[@class='avatar']/img/@src").get()
            # print(avatar)
            author = response.xpath("//span[@class='name']/a/text()").get()
            # print(author)
            pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*","")
            # print(pub_time)
    
            # Under normal circumstances the url contains only one '?'
            url = response.url
            url1 = url.split("?")[0]
            article_id = url1.split("/")[-1]
            # print(article_id)
    
            # Grab the content together with its html tags, so it is easier to display later
            content = response.xpath("//div[@class='show-content']").get()
            # print(content)
            item = ArticleItem(
                title=title,
                avatar=avatar,
                author=author,
                pub_time=pub_time,
                origin_url=response.url,
                article_id=article_id,
                content=content
            )
            yield item
    
    # Item file
    import scrapy
    
    class ArticleItem(scrapy.Item):
        # define the fields for your item here like:
        title = scrapy.Field()
        content = scrapy.Field()
        article_id = scrapy.Field()
        origin_url = scrapy.Field()
        author = scrapy.Field()
        avatar = scrapy.Field()
        pub_time = scrapy.Field()
        
        
    # Pipeline file: save the data in MySQL
    import pymysql
    from twisted.enterprise import adbapi       # module for asynchronous database access
    from pymysql import cursors
    
    class JianshuSpiderPipeline(object):
        def __init__(self):
            dbparams={
                'host':'127.0.0.1',
                'port':3306,
                'user':'root',
                'password':'',
                'database':'jianshu',
                'charset':'utf8'
            }
            self.conn = pymysql.connect(**dbparams)
            # **dbparams is equivalent to writing host='127.0.0.1', port=3306, ... directly in the call
    
            self.cursor = self.conn.cursor()
            self._sql = None
    
        def process_item(self, item, spider):
            self.cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],
                                          item['pub_time'],item['origin_url'],item['article_id']))
            self.conn.commit() # this commit runs synchronously, which is slow
            return item
    
        @property
        def sql(self):
            if not self._sql: # build the SQL statement only once
                self._sql = '''
                insert into article2(id,title,content,author,avatar,pub_time,
                origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
                '''
            return self._sql
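
    # The pipeline above assumes an `article2` table already exists in the `jianshu` database.
    # A minimal sketch of creating it with pymysql (column names follow the insert statement
    # above; the column types and lengths are assumptions, not taken from the original post):
    def create_article_table():
        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                               password='', database='jianshu', charset='utf8')
        try:
            with conn.cursor() as cursor:
                cursor.execute('''
                create table if not exists article2(
                    id int primary key auto_increment,
                    title varchar(255),
                    content longtext,
                    author varchar(255),
                    avatar varchar(255),
                    pub_time varchar(255),
                    origin_url varchar(255),
                    article_id varchar(32)
                ) default charset=utf8
                ''')
            conn.commit()
        finally:
            conn.close()
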
    # Optimize the pipeline above so that saving is asynchronous
    # Use the ConnectionPool provided by Twisted's adbapi to make the insert asynchronous (worth mentioning in interviews)
    
    # The storage above is synchronous and slow; optimize it to be asynchronous
    class JianshuTwistedPipeline(object):
        def __init__(self):
            # create the connection pool
            dbparams = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'database': 'jianshu',
                'charset': 'utf8',
                'cursorclass':cursors.DictCursor
            }
            self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)
            self._sql = None
    
        @property
        def sql(self):
            if not self._sql: # build the SQL statement only once
                self._sql = '''
                insert into article2(id,title,content,author,avatar,pub_time,
                origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
                '''
            return self._sql
    
        def process_item(self,item,spider):
            # runInteraction runs the insert asynchronously through the connection pool
            defer = self.dbpool.runInteraction(self.insert_item,item)
            defer.addErrback(self.handle_error,item,spider)
            return item
    
        def insert_item(self,cursor,item): # insert the record into the database
            cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],
                                          item['pub_time'],item['origin_url'],item['article_id']))
    
        def handle_error(self,error,item,spider):
            print('='*20)
            print("error:",error)
            print('='*20)
    
    # Update the pipeline entry in settings.py
    ITEM_PIPELINES = {
       # 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
       'jianshu_spider.pipelines.JianshuTwistedPipeline': 300, # save data asynchronously
    }
    # Handle dynamic data loaded via ajax
    # processed with selenium+chromedriver
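
    # The original post mentions integrating selenium+chromedriver into Scrapy but does not show
    # the middleware itself. Below is a minimal sketch of a downloader middleware that renders
    # each page with Chrome so the ajax-loaded fields are present in the response; the class
    # name, file location and priority are assumptions, not taken from the original post.
    from scrapy.http import HtmlResponse
    from selenium import webdriver
    
    class SeleniumDownloadMiddleware(object):
        def __init__(self):
            self.driver = webdriver.Chrome()  # chromedriver must be on PATH
    
        def process_request(self, request, spider):
            self.driver.get(request.url)
            # optionally wait or scroll here until the ajax content has finished loading
            source = self.driver.page_source
            # returning a response here short-circuits Scrapy's default downloader
            return HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
    
    # Enable it in settings.py, e.g.:
    # DOWNLOADER_MIDDLEWARES = {
    #     'jianshu_spider.middlewares.SeleniumDownloadMiddleware': 543,
    # }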
    
    
    # Spider file: also extract the read count, like count, word count, subject (collection) and comment count, and save them to the item
        def parse_detail(self, response):
            # print(response.text)
            title = response.xpath("//div[@class='note']/div[@class='post']/div[@class='article']/h1[@class='title']/text()").get()
            print(title)
            avatar = response.xpath("//a[@class='avatar']/img/@src").get()
            # print(avatar)
            author = response.xpath("//span[@class='name']/a/text()").get()
            # print(author)
            pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*","")
            # print(pub_time)
    
            # Under normal circumstances the url contains only one '?'
            url = response.url
            url1 = url.split("?")[0]
            article_id = url1.split("/")[-1]
            # print(article_id)
    
            # Grab the content together with its html tags, so it is easier to display later
            content = response.xpath("//div[@class='show-content']").get()
            # print(content)
    
            # the fields below are loaded dynamically (via ajax)
            word_count = response.xpath("//span[@class='wordage']/text()").get().split(" ")[-1]
            read_count = response.xpath("//span[@class='views-count']/text()").get().split(" ")[-1]
            comment_count = response.xpath("//span[@class='comments-count']/text()").get().split(" ")[-1]
            like_count = response.xpath("//span[@class='likes-count']/text()").get().split(" ")[-1]
            subject = response.xpath("//div[@class='include-collection']/a/div/text()").getall()
            # subject is returned as a list; MySQL cannot store a list directly, so join it into a string
            subject = ",".join(subject)
    
            item = ArticleItem(
                title=title,
                avatar=avatar,
                author=author,
                pub_time=pub_time,
                origin_url=response.url,
                article_id=article_id,
                content=content,
                
                word_count=word_count,
                read_count=read_count,
                comment_count=comment_count,
                like_count=like_count,
                subject=subject,
            )
            yield item
    
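    # The extended parse_detail above fills five extra fields, so the item class needs them as
    # well. A sketch of the extended ArticleItem (the original post does not show this updated
    # item file; the field names simply mirror the spider code):
    class ArticleItem(scrapy.Item):
        title = scrapy.Field()
        content = scrapy.Field()
        article_id = scrapy.Field()
        origin_url = scrapy.Field()
        author = scrapy.Field()
        avatar = scrapy.Field()
        pub_time = scrapy.Field()
        # fields for the ajax-loaded data
        word_count = scrapy.Field()
        read_count = scrapy.Field()
        comment_count = scrapy.Field()
        like_count = scrapy.Field()
        subject = scrapy.Field()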
    
    
    # Pipeline file
    # The storage above is synchronous and slow; optimize it to be asynchronous
    class JianshuTwistedPipeline(object):
        def __init__(self):
            # create the connection pool
            dbparams = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'database': 'jianshu',
                'charset': 'utf8',
                'cursorclass':cursors.DictCursor
            }
            self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)
            self._sql = None
    
        @property
        def sql(self):
            if not self._sql: # build the SQL statement only once
                self._sql = '''
                insert into article2(id,title,content,author,avatar,pub_time,
                origin_url,article_id,read_count,word_count,like_count,comment_count,subject)
                values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                '''
            return self._sql
    
        def process_item(self,item,spider):
            # runInteraction runs the insert asynchronously through the connection pool
            defer = self.dbpool.runInteraction(self.insert_item,item)
            defer.addErrback(self.handle_error,item,spider)
            return item
    
        def insert_item(self,cursor,item): # insert the record into the database
            cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],
                                          item['pub_time'],item['origin_url'],item['article_id'],
                                     item['read_count'],item['word_count'],item['like_count'],item['comment_count'],item['subject']))
    
        def handle_error(self,error,item,spider):
            print('='*20+'error'+'='*20)
            print("error:",error)
            print('='*20+'error'+'='*20)
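
    # The extended insert statement above requires the extra columns to exist in article2.
    # A sketch of adding them with pymysql (the column types are assumptions, not taken from
    # the original post):
    def add_extra_columns(conn):
        with conn.cursor() as cursor:
            cursor.execute('''
            alter table article2
                add column read_count int,
                add column word_count int,
                add column like_count int,
                add column comment_count int,
                add column subject varchar(255)
            ''')
        conn.commit()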
Original post: https://www.cnblogs.com/kenD/p/11123696.html