  • Jianshu full-site crawl with CrawlSpider, saving to MySQL asynchronously

    # Jianshu (jianshu.com)
    # Save the data in MySQL; integrate selenium+chromedriver into Scrapy; crawl the whole site
    # Scrape ajax-loaded data
    
    # Spider file
    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from jianshu_spider.items import ArticleItem
    
    class JsSpider(CrawlSpider):
        name = 'js'
        allowed_domains = ['jianshu.com']
        start_urls = ['https://www.jianshu.com/'] # start crawling from the homepage
    
        rules = (
            # On a detail page, the recommended articles below the content have hrefs of the form /p/.......
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
                 callback='parse_detail', follow=True),
        )
    
        def parse_detail(self, response):
            # print(response.text)
            title = response.xpath("//div[@class='note']/div[@class='post']/div[@class='article']/h1[@class='title']/text()").get()
            # print(title)
            avatar = response.xpath("//a[@class='avatar']/img/@src").get()
            # print(avatar)
            author = response.xpath("//span[@class='name']/a/text()").get()
            # print(author)
            pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*","")
            # print(pub_time)
    
            # Under normal circumstances the url contains only one '?'
            url = response.url
            url1 = url.split("?")[0]
            article_id = url1.split("/")[-1]
            # print(article_id)
    
            # Grab the content together with its html tags, so it is easier to display later
            content = response.xpath("//div[@class='show-content']").get()
            # print(content)
            item = ArticleItem(
                title=title,
                avatar=avatar,
                author=author,
                pub_time=pub_time,
                origin_url=response.url,
                article_id=article_id,
                content=content
            )
            yield item
    
    # Item file
    import scrapy
    
    class ArticleItem(scrapy.Item):
        # define the fields for your item here like:
        title = scrapy.Field()
        content = scrapy.Field()
        article_id = scrapy.Field()
        origin_url = scrapy.Field()
        author = scrapy.Field()
        avatar = scrapy.Field()
        pub_time = scrapy.Field()
        
        
    # Pipeline file: save the data in MySQL
    import pymysql
    from twisted.enterprise import adbapi       # module for asynchronous database access
    from pymysql import cursors
    
    class JianshuSpiderPipeline(object):
        def __init__(self):
            dbparams={
                'host':'127.0.0.1',
                'port':3306,
                'user':'root',
                'password':'',
                'database':'jianshu',
                'charset':'utf8'
            }
            self.conn = pymysql.connect(**dbparams)
            # **dbparams is equivalent to writing host='127.0.0.1', port=3306, ... directly in the call
    
            self.cursor = self.conn.cursor()
            self._sql = None
    
        def process_item(self, item, spider):
            self.cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],
                                          item['pub_time'],item['origin_url'],item['article_id']))
            self.conn.commit() # this commit runs synchronously, which is slow
            return item
    
        @property
        def sql(self):
            if not self._sql: # build the SQL statement only once
                self._sql = '''
                insert into article2(id,title,content,author,avatar,pub_time,
                origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
                '''
            return self._sql
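
    # The pipeline above assumes an `article2` table already exists in the `jianshu` database.
    # A minimal sketch of creating it with pymysql (column names follow the insert statement
    # above; the column types and lengths are assumptions, not taken from the original post):
    def create_article_table():
        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                               password='', database='jianshu', charset='utf8')
        try:
            with conn.cursor() as cursor:
                cursor.execute('''
                create table if not exists article2(
                    id int primary key auto_increment,
                    title varchar(255),
                    content longtext,
                    author varchar(255),
                    avatar varchar(255),
                    pub_time varchar(255),
                    origin_url varchar(255),
                    article_id varchar(32)
                ) default charset=utf8
                ''')
            conn.commit()
        finally:
            conn.close()
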
    # Optimize the pipeline above so that saving is asynchronous
    # Use the ConnectionPool provided by Twisted's adbapi to make the insert asynchronous (worth mentioning in interviews)
    
    # The storage above is synchronous and slow; optimize it to be asynchronous
    class JianshuTwistedPipeline(object):
        def __init__(self):
            # create the connection pool
            dbparams = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'database': 'jianshu',
                'charset': 'utf8',
                'cursorclass':cursors.DictCursor
            }
            self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)
            self._sql = None
    
        @property
        def sql(self):
            if not self._sql: # build the SQL statement only once
                self._sql = '''
                insert into article2(id,title,content,author,avatar,pub_time,
                origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
                '''
            return self._sql
    
        def process_item(self,item,spider):
            # runInteraction runs the insert asynchronously through the connection pool
            defer = self.dbpool.runInteraction(self.insert_item,item)
            defer.addErrback(self.handle_error,item,spider)
            return item
    
        def insert_item(self,cursor,item): # insert the record into the database
            cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],
                                          item['pub_time'],item['origin_url'],item['article_id']))
    
        def handle_error(self,error,item,spider):
            print('='*20)
            print("error:",error)
            print('='*20)
    
    # Update the pipeline entry in settings.py
    ITEM_PIPELINES = {
       # 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
       'jianshu_spider.pipelines.JianshuTwistedPipeline': 300, # save data asynchronously
    }
    # Handle dynamic data loaded via ajax
    # processed with selenium+chromedriver
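
    # The original post mentions integrating selenium+chromedriver into Scrapy but does not show
    # the middleware itself. Below is a minimal sketch of a downloader middleware that renders
    # each page with Chrome so the ajax-loaded fields are present in the response; the class
    # name, file location and priority are assumptions, not taken from the original post.
    from scrapy.http import HtmlResponse
    from selenium import webdriver
    
    class SeleniumDownloadMiddleware(object):
        def __init__(self):
            self.driver = webdriver.Chrome()  # chromedriver must be on PATH
    
        def process_request(self, request, spider):
            self.driver.get(request.url)
            # optionally wait or scroll here until the ajax content has finished loading
            source = self.driver.page_source
            # returning a response here short-circuits Scrapy's default downloader
            return HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
    
    # Enable it in settings.py, e.g.:
    # DOWNLOADER_MIDDLEWARES = {
    #     'jianshu_spider.middlewares.SeleniumDownloadMiddleware': 543,
    # }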
    
    
    # Spider file: also extract the read count, like count, word count, subject (collection) and comment count, and save them to the item
        def parse_detail(self, response):
            # print(response.text)
            title = response.xpath("//div[@class='note']/div[@class='post']/div[@class='article']/h1[@class='title']/text()").get()
            print(title)
            avatar = response.xpath("//a[@class='avatar']/img/@src").get()
            # print(avatar)
            author = response.xpath("//span[@class='name']/a/text()").get()
            # print(author)
            pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*","")
            # print(pub_time)
    
            # Under normal circumstances the url contains only one '?'
            url = response.url
            url1 = url.split("?")[0]
            article_id = url1.split("/")[-1]
            # print(article_id)
    
            # Grab the content together with its html tags, so it is easier to display later
            content = response.xpath("//div[@class='show-content']").get()
            # print(content)
    
            # the fields below are loaded dynamically (via ajax)
            word_count = response.xpath("//span[@class='wordage']/text()").get().split(" ")[-1]
            read_count = response.xpath("//span[@class='views-count']/text()").get().split(" ")[-1]
            comment_count = response.xpath("//span[@class='comments-count']/text()").get().split(" ")[-1]
            like_count = response.xpath("//span[@class='likes-count']/text()").get().split(" ")[-1]
            subject = response.xpath("//div[@class='include-collection']/a/div/text()").getall()
            # subject is returned as a list; MySQL cannot store a list directly, so join it into a string
            subject = ",".join(subject)
    
            item = ArticleItem(
                title=title,
                avatar=avatar,
                author=author,
                pub_time=pub_time,
                origin_url=response.url,
                article_id=article_id,
                content=content,
                
                word_count=word_count,
                read_count=read_count,
                comment_count=comment_count,
                like_count=like_count,
                subject=subject,
            )
            yield item
    
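    # The extended parse_detail above fills five extra fields, so the item class needs them as
    # well. A sketch of the extended ArticleItem (the original post does not show this updated
    # item file; the field names simply mirror the spider code):
    class ArticleItem(scrapy.Item):
        title = scrapy.Field()
        content = scrapy.Field()
        article_id = scrapy.Field()
        origin_url = scrapy.Field()
        author = scrapy.Field()
        avatar = scrapy.Field()
        pub_time = scrapy.Field()
        # fields for the ajax-loaded data
        word_count = scrapy.Field()
        read_count = scrapy.Field()
        comment_count = scrapy.Field()
        like_count = scrapy.Field()
        subject = scrapy.Field()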
    
    
    # Pipeline file
    # The storage above is synchronous and slow; optimize it to be asynchronous
    class JianshuTwistedPipeline(object):
        def __init__(self):
            # create the connection pool
            dbparams = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'database': 'jianshu',
                'charset': 'utf8',
                'cursorclass':cursors.DictCursor
            }
            self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)
            self._sql = None
    
        @property
        def sql(self):
            if not self._sql: # build the SQL statement only once
                self._sql = '''
                insert into article2(id,title,content,author,avatar,pub_time,
                origin_url,article_id,read_count,word_count,like_count,comment_count,subject)
                values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                '''
            return self._sql
    
        def process_item(self,item,spider):
            # runInteraction runs the insert asynchronously through the connection pool
            defer = self.dbpool.runInteraction(self.insert_item,item)
            defer.addErrback(self.handle_error,item,spider)
            return item
    
        def insert_item(self,cursor,item): # insert the record into the database
            cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],
                                          item['pub_time'],item['origin_url'],item['article_id'],
                                     item['read_count'],item['word_count'],item['like_count'],item['comment_count'],item['subject']))
    
        def handle_error(self,error,item,spider):
            print('='*20+'error'+'='*20)
            print("error:",error)
            print('='*20+'error'+'='*20)
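
    # The extended insert statement above requires the extra columns to exist in article2.
    # A sketch of adding them with pymysql (the column types are assumptions, not taken from
    # the original post):
    def add_extra_columns(conn):
        with conn.cursor() as cursor:
            cursor.execute('''
            alter table article2
                add column read_count int,
                add column word_count int,
                add column like_count int,
                add column comment_count int,
                add column subject varchar(255)
            ''')
        conn.commit()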
Original post: https://www.cnblogs.com/kenD/p/11123696.html