zoukankan      html  css  js  c++  java
  • 爬虫第六篇:scrapy框架爬取某书网整站爬虫爬取

    新建项目

    # 新建项目
    $ scrapy startproject jianshu
    # 进入到文件夹 $ cd jianshu
    # 新建spider文件 $ scrapy genspider
    -t crawl jianshu_spider jianshu.com

    items.py文件

    import scrapy
    
    
    class ArticleItem(scrapy.Item):
        """Container for one scraped Jianshu article (filled in parse_detail)."""
        title = scrapy.Field()       # article headline text
        content = scrapy.Field()     # article body HTML fragment
        article_id = scrapy.Field()  # last path segment of the article URL
        origin_url = scrapy.Field()  # full URL the article was scraped from
        author = scrapy.Field()      # author display name
        avatar = scrapy.Field()      # URL of the author's avatar image
        pub_time = scrapy.Field()    # publish-time string as shown on the page

    jianshu_spider.py文件

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from jianshu.items import ArticleItem
    
    
    class JianshuSpiderSpider(CrawlSpider):
        """Crawl spider that follows Jianshu article links site-wide and
        extracts one ArticleItem per article detail page."""
        name = 'jianshu_spider'
        allowed_domains = ['jianshu.com']
        start_urls = ['https://www.jianshu.com/']

        # Article pages look like /p/<12 lowercase hex-ish chars>; follow them
        # everywhere and hand each one to parse_detail.
        rules = (
            Rule(
                LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
                callback='parse_detail',
                follow=True,
            ),
        )

        def parse_detail(self, response):
            """Pull the article fields out of a detail page and yield the item."""
            # Field name -> xpath on the detail page.
            selectors = {
                'title': "//h1[@class='title']/text()",
                'content': "//div[@class='show-content-free']",
                'avatar': "//a[@class='avatar']/img/@src",
                'author': "//div[@class='info']/span/a/text()",
                'pub_time': "//span[@class='publish-time']/text()",
            }
            fields = {name: response.xpath(xp).get() for name, xp in selectors.items()}
            # Drop any query string, then the last path segment is the id.
            page_url = response.url
            fields['article_id'] = page_url.split("?")[0].split("/")[-1]
            fields['origin_url'] = page_url
            yield ArticleItem(**fields)

    同步的MySQL插入数据

    import pymysql
    
    
    class JianshuPipeline(object):
        """Synchronous MySQL pipeline: inserts one row per item and commits
        immediately, blocking the crawl for the duration of each write."""

        def __init__(self):
            # NOTE(review): credentials are hard-coded; consider moving them
            # to Scrapy settings.
            dbparams = {
                'host': '127.0.0.1',
                'user': 'root',
                'password': '123456',
                'database': 'jianshu',
                'port': 3306,
                'charset': 'utf8'
            }
            self.conn = pymysql.connect(**dbparams)
            self.cursor = self.conn.cursor()
            self._sql = None  # lazily-built INSERT statement (see `sql` property)

        def process_item(self, item, spider):
            """Insert the item into the `article` table and commit.

            Returns the item unchanged so later pipeline stages still see it.
            """
            self.cursor.execute(self.sql, (item['title'], item['content'], item['author'],
                                           item['avatar'], item['pub_time'],
                                           item['origin_url'], item['article_id']))
            self.conn.commit()
            return item

        def close_spider(self, spider):
            """Release the cursor and connection when the spider finishes.

            The original version never closed them, leaking the MySQL
            connection for the lifetime of the process.
            """
            self.cursor.close()
            self.conn.close()

        @property
        def sql(self):
            """Build the INSERT statement once and cache it on the instance."""
            if not self._sql:
                self._sql = """
                insert into article(title,content, author, avatar, pub_time, origin_url, article_id) values (%s, %s, %s, %s, %s, %s,%s)
                """
            return self._sql

    异步的MySQL插入数据

    from twisted.enterprise import adbapi
    from pymysql import cursors
    class JianshuTwistedPipeline(object):
        """Asynchronous MySQL pipeline: schedules inserts on a twisted adbapi
        connection pool so database writes do not block the crawl."""

        def __init__(self):
            dbparams = {
                'host': '127.0.0.1',
                'user': 'root',
                'password': '123456',
                'database': 'jianshu',
                'port': 3306,
                'charset': 'utf8',
                'cursorclass': cursors.DictCursor
            }
            self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
            self._sql = None  # lazily-built INSERT statement (see `sql` property)

        @property
        def sql(self):
            """Build the INSERT statement once and cache it on the instance."""
            if not self._sql:
                self._sql = """
                    insert into article(title,content, author, avatar, pub_time, origin_url, article_id) values (%s, %s, %s, %s, %s, %s,%s)
                    """
            return self._sql

        def process_item(self, item, spider):
            """Queue the insert on the pool and pass the item along.

            BUG FIX: the original returned None, so any pipeline stage after
            this one received nothing; Scrapy's pipeline contract requires
            returning the item (or a Deferred that fires with it).
            """
            defer = self.dbpool.runInteraction(self.insert_item, item)
            defer.addErrback(self.handle_error, item, spider)
            return item

        def insert_item(self, cursor, item):
            # Runs in a pool thread; adbapi commits the interaction on success.
            cursor.execute(self.sql, (item['title'], item['content'], item['author'],
                                      item['avatar'], item['pub_time'],
                                      item['origin_url'], item['article_id']))

        def handle_error(self, error, item, spider):
            """Log a failed insert instead of letting it vanish silently."""
            print('=' * 10 + 'error' + '=' * 10)
            print(error)
            print('=' * 10 + 'error' + '=' * 10)

     

  • 相关阅读:
    (15)疯狂的程序员----《绝影》
    (14)嵌入式软件开发工程师技能要求总结
    (13)碎片化阅读只会让你变得越来越愚蠢
    (12)QT中搭建opencv开发环境
    (11)git服务器的安装和配置
    (10)python学习笔记一
    (3.3)狄泰软件学院C++课程学习剖析四
    (9)Linux下gdb调试学习
    (8)Linux(客户端)和Windows(服务端)下socket通信实例
    springMVC伪静态
  • 原文地址:https://www.cnblogs.com/leijing0607/p/8075324.html
Copyright © 2011-2022 走看看