  • 21 Days to Build a Distributed Crawler: Crawling the Whole Jianshu Site (Part 10)

    10.1. The Jianshu full-site crawler

    Create the project

    scrapy startproject jianshu
    
    scrapy genspider -t crawl jianshu_spider "jianshu.com"
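
    The first command scaffolds the project; the second generates a spider from the built-in crawl template, producing the CrawlSpider subclass with a rules attribute shown next.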

    jianshu_spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from jianshu.items import JianshuItem
    
    class JianshuSpiderSpider(CrawlSpider):
        name = 'jianshu_spider'
        allowed_domains = ['jianshu.com']
        start_urls = ['http://jianshu.com/']
    
        rules = (
            # article detail pages look like /p/<12-character id>
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
        )
    
        def parse_detail(self, response):
            title = response.xpath("//h1[@class='title']/text()").get()
            avatar = response.xpath("//a[@class='avatar']/img/@src").get()
            author = response.xpath("//span[@class='name']/a/text()").get()
            # the publish-time text carries a trailing "*", which we strip off
            pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*","")
            # extract the article id from the URL
            url = response.url
            url1 = url.split("?")[0]
            article_id = url1.split("/")[-1]
            # the article content, kept with its HTML tags rather than as plain text
            content = response.xpath("//div[@class='show-content']").get()
            word_count = response.xpath("//span[@class='wordage']/text()").get()
            comment_count = response.xpath("//span[@class='comments-count']/text()").get()
            read_count = response.xpath("//span[@class='views-count']/text()").get()
            like_count = response.xpath("//span[@class='likes-count']/text()").get()
            subjects = ",".join(response.xpath("//div[@class='include-collection']/a/div/text()").getall())
    
            item = JianshuItem(
                title=title,
                avatar=avatar,
                pub_time=pub_time,
                author=author,
                origin_url=response.url,
                content=content,
                article_id=article_id,
                subjects=subjects,
                word_count=word_count,
                comment_count=comment_count,
                like_count=like_count,
                read_count=read_count
            )
            yield item
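
    As a quick sanity check of the Rule's allow pattern, the article ids below are made up but show which URLs get followed:

    import re

    pattern = re.compile(r'.*/p/[0-9a-z]{12}.*')
    print(bool(pattern.match('https://www.jianshu.com/p/0e53443a9dd6')))   # True: article detail page
    print(bool(pattern.match('https://www.jianshu.com/u/0e53443a9dd6')))   # False: user profile, no /p/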

    items.py

    import scrapy
    
    class JianshuItem(scrapy.Item):
        title = scrapy.Field()
        content = scrapy.Field()
        article_id = scrapy.Field()
        origin_url = scrapy.Field()
        author = scrapy.Field()
        avatar = scrapy.Field()
        pub_time = scrapy.Field()
        read_count = scrapy.Field()
        like_count = scrapy.Field()
        word_count = scrapy.Field()
        subjects = scrapy.Field()
        comment_count = scrapy.Field()
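
    JianshuItem behaves like a dict: each scrapy.Field() call just declares a key the spider may populate, and assigning an undeclared key raises a KeyError instead of failing silently.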

    pipelines.py

    # -*- coding: utf-8 -*-
    # import pymysql
    #
    # class JianshuPipeline(object):
    #     def __init__(self):
    #         dbparams = {
    #             'host': '127.0.0.1',
    #             'port': 3306,
    #             'user': 'root',
    #             'password': '123456',
    #             'database': 'jianshu',
    #             'charset': 'utf8'
    #         }
    #         self.conn = pymysql.connect(**dbparams)
    #         self.cursor = self.conn.cursor()
    #         self._sql = None
    #
    #     def process_item(self, item, spider):
    #         self.cursor.execute(self.sql, (item['title'], item['content'],
    #                                        item['author'], item['avatar'], item['pub_time'], item['article_id'],
    #                                        item['origin_url'],item['like_count'],item['word_count'],item['subjects'],item['comment_count'],item['read_count']))
    #         self.conn.commit()
    #         return item
    #
    #     @property
    #     def sql(self):
    #         if not self._sql:
    #             self._sql = """
    #                 insert into article(id,title,content,author,avatar,pub_time,
    #                 article_id,origin_url,like_count,word_count,subjects,comment_count,read_count) values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    #             """
    #         return self._sql
    
    
    # Save records to MySQL asynchronously using Twisted's adbapi connection pool
    
    import pymysql
    from twisted.enterprise import adbapi
    from pymysql import cursors
    
    
    class JianshuTwistedPipeline(object):
        def __init__(self):
            dbparams = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '123456',
                'database': 'jianshu',
                'charset': 'utf8',
                'cursorclass': cursors.DictCursor
            }
            self.dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
            self._sql = None
    
        @property
        def sql(self):
            if not self._sql:
                self._sql = """
                    insert into article(id,title,content,author,avatar,pub_time,
                    article_id,origin_url,like_count,word_count,subjects,comment_count,read_count) values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                """
            return self._sql
    
        def process_item(self, item, spider):
            defer = self.dbpool.runInteraction(self.insert_item, item)
            defer.addErrback(self.handle_error, item, spider)
            # return the item so any later pipelines still receive it
            return item
    
        def insert_item(self, cursor, item):
            cursor.execute(self.sql, (item['title'], item['content'],
                                      item['author'], item['avatar'], item['pub_time'],
                                      item['article_id'], item['origin_url'], item['like_count'],
                                      item['word_count'], item['subjects'], item['comment_count'],
                                      item['read_count']))
    
        def handle_error(self, error, item, spider):
            # log failed inserts instead of swallowing them silently
            print(error)
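
    runInteraction runs insert_item in a worker thread with its own cursor and returns a Deferred, so the MySQL writes don't block the crawl. The pipeline assumes an article table already exists; a minimal one-off script to create it might look like the sketch below (the column names come from the insert statement, but the types and lengths are assumptions):

    import pymysql

    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                           password='123456', database='jianshu', charset='utf8')
    with conn.cursor() as cursor:
        cursor.execute("""
            create table if not exists article(
                id int primary key auto_increment,
                title varchar(200),
                content longtext,
                author varchar(100),
                avatar varchar(500),
                pub_time varchar(50),
                article_id varchar(20),
                origin_url varchar(500),
                like_count varchar(20),
                word_count varchar(20),
                subjects text,
                comment_count varchar(20),
                read_count varchar(20)
            ) default charset=utf8
        """)
    conn.commit()
    conn.close()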

    middlewares.py

    # -*- coding: utf-8 -*-
    
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    import time
    from scrapy.http.response.html import HtmlResponse
    
    
    class SeleniumDownloadMiddleware(object):
        def __init__(self):
            self.driver = webdriver.Chrome()
    
        def process_request(self, request, spider):
            self.driver.get(request.url)
            time.sleep(1)
            # keep clicking "show more" until the button disappears;
            # find_element_by_class_name raises NoSuchElementException
            # once there is nothing left to expand, which ends the loop
            try:
                while True:
                    showmore = self.driver.find_element_by_class_name('show-more')
                    showmore.click()
                    time.sleep(0.5)
            except NoSuchElementException:
                pass
            source = self.driver.page_source
            response = HtmlResponse(url=self.driver.current_url, body=source,
                                    request=request, encoding='utf-8')
            return response
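
    Because process_request returns an HtmlResponse, Scrapy skips its normal downloader entirely and every page is fetched through this single Chrome instance. The tutorial code never closes the browser; a minimal sketch of the standard from_crawler/signals wiring to quit it when the crawl ends (not part of the original code) could look like this:

    from scrapy import signals
    from selenium import webdriver

    class SeleniumDownloadMiddleware(object):
        def __init__(self):
            self.driver = webdriver.Chrome()

        @classmethod
        def from_crawler(cls, crawler):
            middleware = cls()
            # quit Chrome when the spider finishes instead of leaking the process
            crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
            return middleware

        def spider_closed(self, spider):
            self.driver.quit()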

    settings.py

    ROBOTSTXT_OBEY = False
    
    DOWNLOAD_DELAY = 1
    
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    
    DOWNLOADER_MIDDLEWARES = {
        'jianshu.middlewares.SeleniumDownloadMiddleware': 543,
    }
    
    ITEM_PIPELINES = {
        # 'jianshu.pipelines.JianshuPipeline': 300,
        'jianshu.pipelines.JianshuTwistedPipeline': 1,
    }

    start.py

    from scrapy import cmdline
    
    cmdline.execute("scrapy crawl jianshu_spider".split())
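
    Running this file is equivalent to typing scrapy crawl jianshu_spider in a terminal at the project root, which makes it easy to launch or debug the spider from an IDE.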