zoukankan      html  css  js  c++  java
  • 简书网站 爬取所有文章(同步方式保存数据库)

    import scrapy
    import re
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    from jianshu.items import JianshuItem
    
    
    class JsSpider(CrawlSpider):
        """Crawl jianshu.com and yield a JianshuItem per article page.

        Article URLs match ``/p/`` followed by 12 lowercase-hex characters;
        every match is parsed by :meth:`parse_detail` and its links are
        followed for further crawling.
        """
        name = 'js'
        allowed_domains = ['jianshu.com']
        start_urls = ['https://www.jianshu.com/']

        rules = (
            # Match article detail links, e.g. https://www.jianshu.com/p/0123456789ab
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
        )

        @staticmethod
        def _first_number(text):
            """Return the first run of digits in *text* as a string, or None.

            Fixes the original pattern ``'d+'`` (missing backslash), which
            matched the literal letter "d" instead of digits, so every count
            field silently came back None.  Also tolerates ``text`` being
            None, which ``re.findall`` would reject with a TypeError.
            """
            if not text:
                return None
            match = re.search(r'\d+', text)
            return match.group() if match else None

        def parse_detail(self, response):
            """Extract one article page's fields into a JianshuItem."""
            title = response.xpath('//h1[@class="title"]/text()').get()
            avatar = response.xpath('//a[@class="avatar"]/img/@src').get()
            author = response.xpath('//span[@class="name"]//text()').get()
            content = response.xpath('//div[@class="show-content"]').get()
            # Guard against a missing publish-time node: .get() returns None
            # there, and None.replace(...) would raise AttributeError.
            pub_time = response.xpath('//span[@class="publish-time"]/text()').get()
            if pub_time:
                pub_time = pub_time.replace('*', '')
            read_count = self._first_number(
                response.xpath('//span[@class="views-count"]/text()').get())
            comment_count = self._first_number(
                response.xpath('//span[@class="comments-count"]/text()').get())
            like_count = self._first_number(
                response.xpath('//span[@class="likes-count"]/text()').get())
            rewards_count = self._first_number(
                response.xpath('//span[@class="wordage"]/text()').get())
            item = JianshuItem(
                title=title,
                avatar=avatar,
                author=author,
                content=content,
                pub_time=pub_time,
                read_count=read_count,
                comment_count=comment_count,
                like_count=like_count,
                rewards_count=rewards_count,
            )
            yield item

    items:

    import scrapy
    
    
    class JianshuItem(scrapy.Item):
        """Container for one jianshu.com article scraped by the ``js`` spider.

        Count fields hold digit strings extracted from the page (or None
        when the element was absent); they are inserted into MySQL by
        JianShuPipeline in this same project.
        """
        title = scrapy.Field()          # article headline text
        avatar = scrapy.Field()         # author avatar image URL
        author = scrapy.Field()         # author display name
        content = scrapy.Field()        # full article body HTML
        pub_time = scrapy.Field()       # publish-time text, '*' stripped
        read_count = scrapy.Field()     # views count (digits as str, or None)
        comment_count = scrapy.Field()  # comments count (digits as str, or None)
        like_count = scrapy.Field()     # likes count (digits as str, or None)
        rewards_count = scrapy.Field()  # rewards count (digits as str, or None)

    pipeline:

    class JianShuPipeline(object):
        """Synchronously persist each JianshuItem into a MySQL ``article`` table.

        NOTE(review): connects to a local MySQL server as root with an
        EMPTY password and assumes the ``jianshu.article`` table already
        exists -- confirm/parameterize before deploying anywhere real.
        """

        def __init__(self):
            # Connection parameters for the local MySQL instance.
            db_params = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'database': 'jianshu',
                'charset': 'utf8'
            }
            self.conn = pymysql.connect(**db_params)
            self.cursor = self.conn.cursor()
            # Lazily-built INSERT statement, cached by the ``sql`` property.
            self._sql = None

        def process_item(self, item, spider):
            """Insert one scraped article and commit immediately (synchronous)."""
            self.cursor.execute(self.sql,
                                (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'],
                                 item['like_count'], item['read_count'], item['comment_count'], item['rewards_count']))
            self.conn.commit()
            return item

        def close_spider(self, spider):
            """Release database resources when the spider finishes.

            The original pipeline leaked the cursor and connection; Scrapy
            invokes this hook automatically on shutdown.
            """
            self.cursor.close()
            self.conn.close()

        @property
        def sql(self):
            """Build the parameterized INSERT statement once and cache it.

            The original had a redundant duplicated ``return self._sql``;
            a single return after the lazy build is equivalent.
            """
            if not self._sql:
                self._sql = """
               INSERT INTO article(id, title, content, author, avatar, pub_time, like_count, read_count, comment_count, 
               rewards_count) VALUES (null, %s, %s, %s, %s, %s, %s, %s, %s, %s);
               """
            return self._sql
  • 相关阅读:
    line-height 对a标签在有些浏览器中不支持
    git 学习手记
    nth-child 与 nth-of-type区别
    less学习笔记(持续更新)
    如何让textarea的placeholder中的文字换行
    livereload 不刷新页面 保存文件后 浏览器自动重新刷新
    优雅降级元(CSS JS)
    vw vh vm CSS长度单位
    NODE_PATH的设置
    EF学习笔记(十二):EF高级应用场景
  • 原文地址:https://www.cnblogs.com/yuqiangli0616/p/10338756.html
Copyright © 2011-2022 走看看