  • Crawling all Jianshu articles with Scrapy

    Here we use the CrawlSpider template, letting its link-filtering rules drive the crawl and storing the scraped results in MySQL. The code follows:

    jianshu_spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from jianshu.items import JianshuItem
    import html  # used only if the commented-out escaping below is enabled


    class JianshuSpiderSpider(CrawlSpider):
        name = 'jianshu_spider'
        allowed_domains = ['jianshu.com']
        start_urls = ['http://jianshu.com/']

        # Article URLs look like /p/<12-character slug>; follow every matching link.
        rules = (
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_article', follow=True),
        )

        def parse_article(self, response):
            # The article code is the last path segment, with any query string stripped.
            article_code = response.url.split("?")[0].split("/")[-1]
            title = response.xpath('//h1[@class="title"]/text()').get().strip()
            author = response.xpath('//div[contains(@class, "author")]/div[@class="info"]//span[@class="name"]/a/text()').get().strip()
            head_img = response.xpath('//div[contains(@class, "author")]/a[@class="avatar"]/img/@src').get()
            pub_time = response.xpath('//span[@class="publish-time"]/text()').get().strip().replace('*', '')
            # The avatar src is protocol-relative, so prepend the scheme.
            head_img_url = "http:{}".format(head_img)
            # To escape the HTML before storing it in the database:
            # content = html.escape(response.xpath('//div[@class="show-content"]').get())
            content = response.xpath('//div[@class="show-content"]').get()

            yield JianshuItem(
                article_code=article_code,
                title=title,
                author=author,
                head_img_url=head_img_url,
                content=content,
                pub_time=pub_time,
            )
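
    Before launching the full crawl, it is worth sanity-checking the Rule's regex against a few URL shapes. A minimal sketch (the sample URLs below are made up for illustration, not taken from a real crawl):

    import re

    # Same pattern as in the Rule above.
    ARTICLE_RE = re.compile(r'.*/p/[0-9a-z]{12}.*')

    samples = [
        "https://www.jianshu.com/p/0123456789ab",        # article page: matches
        "https://www.jianshu.com/p/0123456789ab?utm=1",  # query string allowed by the pattern: matches
        "https://www.jianshu.com/u/0123456789ab",        # user profile: no match
    ]

    for url in samples:
        print(url, "->", bool(ARTICLE_RE.match(url)))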

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class JianshuItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        article_code = scrapy.Field()
        title = scrapy.Field()
        author = scrapy.Field()
        pub_time = scrapy.Field()
        head_img_url = scrapy.Field()
        content = scrapy.Field()
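
    A JianshuItem behaves like a dict, which is what lets the pipeline below unpack it directly into the SQLAlchemy model via model.Article(**item). A quick illustration with placeholder values:

    from jianshu.items import JianshuItem

    # Placeholder field values, for illustration only.
    item = JianshuItem(article_code="0123456789ab", title="demo", author="someone")
    print(item["title"])  # dict-style access
    print(dict(item))     # plain dict of the fields that were set; **item unpacks the same way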

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    from jianshu import model
    
    class JianshuPipeline(object):
    
        def __init__(self):
            self.session = model.DBSession()
    
        def process_item(self, item, spider):
            # The item is dict-like, so it can be unpacked straight into the model.
            article = model.Article(**item)
            try:
                self.session.add(article)
                self.session.commit()
            except Exception as e:
                print("INSERT ERROR: {}".format(e))
                self.session.rollback()
            return item

        def open_spider(self, spider):
            pass
    
        def close_spider(self, spider):
            self.session.close()
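
    As the boilerplate comment notes, the pipeline must be registered in ITEM_PIPELINES before it runs. A minimal sketch of the relevant settings.py entries; the priority value and the politeness setting are assumptions, not part of the original post:

    settings.py

    # Register the pipeline; the number (0-1000) controls execution order
    # relative to any other pipelines.
    ITEM_PIPELINES = {
        'jianshu.pipelines.JianshuPipeline': 300,
    }

    # Assumed politeness setting, not from the original post.
    DOWNLOAD_DELAY = 1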

    model.py

    from sqlalchemy import create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy import String, Text, DateTime, Column, Integer, VARCHAR
    from sqlalchemy.orm import sessionmaker
    
    # Create the database engine; utf8mb4 covers the full Unicode range in article content.
    engine = create_engine("mysql+pymysql://jianshu:jianshu@localhost:3306/jianshu?charset=utf8mb4", echo=False)
    
    # Declarative base: the registry that maps ORM classes to their database tables.
    Base = declarative_base()
    
    class Article(Base):
    
        __tablename__ = "jianshu_article"
    
        id = Column(Integer, autoincrement=True, primary_key=True)
        article_code = Column(String(16), nullable=False)
        title = Column(Text)
        author = Column(String(16))
        pub_time = Column(DateTime)  # the publish time is a full timestamp, so DateTime rather than Time
        head_img_url = Column(VARCHAR(256))
        content = Column(Text)
    
    DBSession = sessionmaker(bind=engine)
    
    if __name__ == '__main__':
        Base.metadata.create_all(engine)
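
    Running model.py directly creates the table via the __main__ guard above. A hedged smoke test for the session setup, with made-up values matching the column types:

    from datetime import datetime
    from jianshu import model

    session = model.DBSession()
    # Placeholder row for a quick round-trip test; every value is made up.
    session.add(model.Article(
        article_code="0123456789ab",
        title="demo title",
        author="someone",
        pub_time=datetime.now(),
        head_img_url="http://example.com/avatar.png",
        content="<div>demo</div>",
    ))
    session.commit()
    session.close()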