zoukankan      html  css  js  c++  java
  • scrapy使用selenium

    myspider.py

    import scrapy
    from scrapy1.items import Scrapy1Item
    from selenium import webdriver
    
    
    '''
    在scrapy中使用selenium的编码流程:
        1.在spider的构造方法中创建一个浏览器对象(作为当前spider的一个属性)
        2.重写spider的一个方法closed(self,spider),在该方法中执行浏览器关闭的操作
        3.在下载中间件的process_response方法中,通过spider参数获取浏览器对象
        4.在中间件的process_response中定制基于浏览器自动化的操作代码(获取动态加载出来的页面源码数据)
        5.实例化一个响应对象,且将page_source返回的页面源码封装到该对象中
        6.返回该新的响应对象
    '''
    
    class TestSpider(scrapy.Spider):
        name = 'qiubai'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['http://war.163.com/']
        def __init__(self):
            self.bro = webdriver.Chrome(executable_path=r'D:Day爬虫存储chromedriver.exe')
    
        def parse(self, response):
            div_list = response.xpath('//div[@class="data_row news_article clearfix "]')
            for div in div_list:
                title = div.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
                print(title)
    
        def closed(self, spider):
            print('bro has been closed')
            self.bro.quit()

    middlewares.py

    from scrapy import signals
    from scrapy.http import HtmlResponse
    from time import sleep
    
    
    class WangyiproDownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_request(self, request, spider):
            # Called for each request that goes through the downloader
            # middleware.
    
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            return None
    
        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
    
            # Must either;
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            print('即将返回一个新的响应对象!!!')
            #如何获取动态加载出来的数据
            bro = spider.bro
            bro.get(url=request.url)
            sleep(3)
            #包含了动态加载出来的新闻数据
            page_text = bro.page_source
            sleep(3)
            return HtmlResponse(url=spider.bro.current_url,body=page_text,encoding='utf-8',request=request)
    
        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
    
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)

    在中间件判断是否需要使用selenium来爬取内容的版本

    from scrapy.http import HtmlResponse    
        #参数介绍:
        #拦截到响应对象(下载器传递给Spider的响应对象)
        #request:响应对象对应的请求对象
        #response:拦截到的响应对象
        #spider:爬虫文件中对应的爬虫类的实例
        def process_response(self, request, response, spider):
            #响应对象中存储页面数据的篡改
            if request.url in['http://news.163.com/domestic/','http://news.163.com/world/','http://news.163.com/air/','http://war.163.com/']:
                spider.bro.get(url=request.url)
                js = 'window.scrollTo(0,document.body.scrollHeight)'
                spider.bro.execute_script(js)
                time.sleep(2)  #一定要给与浏览器一定的缓冲加载数据的时间
                #页面数据就是包含了动态加载出来的新闻数据对应的页面数据
                page_text = spider.bro.page_source
                #篡改响应对象
                return HtmlResponse(url=spider.bro.current_url,body=page_text,encoding='utf-8',request=request)
            else:
                return response
  • 相关阅读:
    php 网络图片转base64
    uniapp微信小程序拒绝授权后,重新调起授权页
    uniapp获取用户信息 getuserinfo
    基于JSP和Mybatis框架技术的应用总结
    Java第16周作业集
    Java第15周作业集
    Java第14周作业集
    Java第13周作业集
    软件工程结课作业
    Java第13次作业--邮箱的正则表达式
  • 原文地址:https://www.cnblogs.com/NachoLau/p/10474256.html
Copyright © 2011-2022 走看看