zoukankan      html  css  js  c++  java
  • scrapy 爬取斗罗大陆漫画

    # -*- coding: utf-8 -*-
    import json
    import os
    import time
    import urllib
    import urllib.parse
    import urllib.request

    import scrapy
    from scrapy.http import Request
    from selenium import webdriver
    from selenium.common.exceptions import (
        TimeoutException,
        WebDriverException,
        NoSuchElementException,
        StaleElementReferenceException
    )
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    
    
    def gen_browser(driver_path):
        """Create a headless Chrome driver with an anti-detection script installed.

        The driver is started with a fixed desktop user agent and a handful of
        Chrome switches, then a JavaScript snippet is registered (via the
        ``Page.addScriptToEvaluateOnNewDocument`` DevTools command) that
        overrides ``navigator.webdriver``, ``navigator.chrome``,
        ``navigator.languages`` and ``navigator.plugins``.

        Side effects:
            * Adds an ``add_script`` method to the ``webdriver.Chrome`` class.
            * Prints the driver's executor URL and session id for debugging.

        Args:
            driver_path: Path to the chromedriver executable, passed through
                as ``executable_path``.

        Returns:
            The started ``webdriver.Chrome`` instance.

        Raises:
            WebDriverException: If the DevTools command reports a non-zero
                status. The browser is quit before the error propagates.
        """
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('disable-infobars')
        options.add_argument("--disable-plugins-discovery")
        user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
        options.add_argument('user-agent="{0}"'.format(user_agent))

        # ---- anti-detection helpers -------------------------------------

        def send(driver, cmd, params=None):
            """Send a DevTools command through chromedriver and return its value.

            Based on:
            https://stackoverflow.com/questions/47297877/to-set-mutationobserver-how-to-inject-javascript-before-page-loading-using-sele/47298910#47298910
            """
            resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
            url = driver.command_executor._url + resource
            # A fresh dict per call instead of a shared mutable default.
            body = json.dumps({'cmd': cmd, 'params': {} if params is None else params})
            response = driver.command_executor._request('POST', url, body)
            # A reply without a 'status' key is treated as success rather than
            # failing with KeyError.
            if response.get('status'):
                raise WebDriverException(str(response.get('value')))
            return response.get('value')

        def add_script(driver, script):
            """Register ``script`` to be evaluated before each new document loads."""
            send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})

        # Attach add_script as a method of webdriver.Chrome. This class may need
        # to change if a different driver is used.
        webdriver.Chrome.add_script = add_script

        stealth_script = """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => false,
        });
        window.navigator.chrome = {
            runtime: {},
        };
        Object.defineProperty(navigator, 'languages', {
            get: () => ['zh-CN', 'zh']
        });
        Object.defineProperty(navigator, 'plugins', {
            get: () => [0, 1, 2],
        });
        """

        browser = webdriver.Chrome(
            executable_path=driver_path,
            chrome_options=options
        )

        # ---- debugging aid ----------------------------------------------
        existed = {
            'executor_url': browser.command_executor._url,  # address the browser can be driven through remotely
            'session_id': browser.session_id  # browser session id
        }
        print(existed)

        # ---- install the anti-detection script --------------------------
        try:
            browser.add_script(stealth_script)
        except Exception:
            # Do not leave an orphaned Chrome process behind when setup fails.
            browser.quit()
            raise
        return browser
    
    
    class XuexingSpider(scrapy.Spider):
        """Download comic chapters page by page using a Selenium-driven browser.

        Scrapy only fetches the start URL; every chapter page is then opened in
        the browser returned by ``gen_browser`` and its images are written to
        ``<output_dir>/<chapter name>/<n><ext>``.
        """

        name = 'xuexing'
        # Matches the host of start_urls (the original listed a different site).
        allowed_domains = ['www.mh1234.com']
        start_urls = ['https://www.mh1234.com/wap/comic/9683/262424.html']

        # Root directory (relative to the working directory) for downloads.
        output_dir = '斗罗大陆'

        # Extensions accepted from the image URL; anything else falls back to
        # _DEFAULT_IMAGE_EXT.
        _IMAGE_EXTS = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
        _DEFAULT_IMAGE_EXT = '.jpg'

        def parse(self, response):
            """Walk the chapters starting at ``response.url`` until no next URL.

            The browser is created from the ``DRIVER_PATH`` setting and is
            always quit when the walk ends, including on error.
            """
            driver_path = self.settings.get('DRIVER_PATH')
            browser = gen_browser(driver_path)
            try:
                next_url = response.url
                i = 0
                while next_url:
                    i += 1
                    print(str(i).center(60, '*'))
                    next_url = self.get_item(browser, next_url)
            finally:
                browser.quit()

        def get_item(self, browser, url=None):
            """Save every page image of one chapter and return the next chapter URL.

            Args:
                browser: Selenium driver used to render the reader page.
                url: Chapter URL to open first; when ``None`` the page the
                    browser is already showing is used.

            Returns:
                The ``href`` of the reader's "next chapter" link.
            """
            if url is not None:
                browser.get(url)  # open the chapter page

            # Chapter name: title text before the first '(' with '/' replaced
            # so that it is usable as a single directory name.
            title = browser.find_elements(By.XPATH, '//a[@class="BarTit"]')[0].text
            van = title.split('(')[0].strip().replace('/', '-')

            # exist_ok makes re-running over an already downloaded chapter safe.
            chapter_dir = os.path.join(self.output_dir, van)
            os.makedirs(chapter_dir, exist_ok=True)

            saved = self._save_current_image(browser, chapter_dir, 0)

            total = int(browser.find_element(By.XPATH, '//*[@id="k_total"]').text)
            for _ in range(1, total):
                time.sleep(0.5)
                # Advance to the next page of the chapter.
                browser.find_element(By.XPATH, '//*[@id="action"]/ul/li[3]/a').click()
                saved = self._save_current_image(browser, chapter_dir, saved)

            next_link = browser.find_element(By.XPATH, '//*[@id="action"]/ul/li[4]/a')
            return next_link.get_attribute('href')

        def _save_current_image(self, browser, chapter_dir, saved):
            """Download the image currently displayed; return the updated image count."""
            img_url = browser.find_element(By.XPATH, '//*[@id="qTcms_pic"]').get_attribute('src')
            if not img_url:
                return saved
            saved += 1
            file_name = '{0}{1}'.format(saved, self._image_extension(img_url))
            with urllib.request.urlopen(img_url) as resp:
                data = resp.read()
            with open(os.path.join(chapter_dir, file_name), 'wb') as f:
                f.write(data)
            return saved

        @classmethod
        def _image_extension(cls, img_url):
            """Return the file extension taken from the URL path, or the default."""
            ext = os.path.splitext(urllib.parse.urlsplit(img_url).path)[1].lower()
            return ext if ext in cls._IMAGE_EXTS else cls._DEFAULT_IMAGE_EXT

  • 相关阅读:
    服务器负载均衡的基本功能和实现原理
    二分查找
    TCP的运输连接管理
    linux常用命令
    XX公司在线笔试题编程题之一
    java对象转json格式
    Java多线程并发技术
    进程同步与通信
    单例模式的C++实现
    rsyncd启动脚本
  • 原文地址:https://www.cnblogs.com/sxqfuture/p/10256347.html
Copyright © 2011-2022 走看看