zoukankan      html  css  js  c++  java
  • python使用selenium爬百度文库ppt并生成pdf

    详细的讲解我是写在另外一个网址:https://www.yuque.com/docs/share/aacfa45c-22c5-4ef6-be97-cd6849002274

    有点尴尬,所以就.....

     在这里直接放下另外一个例子(《数学模型答案》)的代码

    from selenium import  webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.common.action_chains import ActionChains
    import time
    import re
    import requests
    
    class downloader:
        """Save the page images of a Baidu Wenku document as numbered PNG files.

        An instance drives a Chrome browser through Selenium.  Calling the
        instance with a document URL opens the document, walks through its page
        lists and writes every slide image found to ``0.png``, ``1.png``, ... in
        the current working directory.

        Attributes:
            browser: the Selenium Chrome driver used for all page interaction.
            wait: a ``WebDriverWait`` bound to ``browser`` with a 3 second limit.
            i: index used for the next image file name; incremented per save.
            pattern: compiled regex that pulls the address out of a CSS
                ``url(...)`` value.
        """

        # The parentheses of ``url( ... )`` must be escaped so that they match
        # literally; the surrounding quotes are optional in CSS.
        _URL_PATTERN = re.compile(r'url\(\s*["\']?(.*?)["\']?\s*\)', re.S)

        def __init__(self):
            """Start Chrome and initialise the wait helper and file counter."""
            self.browser = webdriver.Chrome()
            self.wait = WebDriverWait(self.browser, 3)
            self.i = 0
            self.pattern = self._URL_PATTERN

        def __call__(self, url):
            """Scrape every image of the document at *url*, then close the browser.

            Args:
                url: address of the document page to open.

            Raises:
                requests.RequestException: if downloading an image fails.
            """
            try:
                self.download(url)
                while True:
                    for link in self.parse_link():
                        self.save(link)

                    # No "next page list" button means there is nothing left
                    # to visit, so the loop ends here.
                    try:
                        next_button = self.browser.find_element(By.ID, 'next-pageList-1')
                    except NoSuchElementException:
                        break
                    self.browser.execute_script(
                        "arguments[0].scrollIntoViewIfNeeded(true);", next_button)
                    next_button.click()
            finally:
                # Always release the Chrome process, even when scraping fails.
                self.browser.quit()

        def download(self, url):
            """Open *url* and click the "read more" control when it is present.

            Args:
                url: address of the document page to open.
            """
            self.browser.get(url)
            try:
                submit = self.wait.until(EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="html-reader-go-more"]/div[2]/div[1]/span/span[1]')))
            except TimeoutException:
                # The control did not show up within the wait limit; carry on
                # with whatever the page has rendered instead of aborting.
                return
            self.browser.execute_script("arguments[0].scrollIntoViewIfNeeded(true);", submit)
            submit.click()

        def _extract_url(self, style):
            """Return the address inside the ``url(...)`` of *style*, or ``None``.

            Args:
                style: value of an element's ``style`` attribute (may be empty
                    or ``None``).
            """
            if not style:
                return None
            match = self.pattern.search(style)
            return match.group(1) if match else None

        def parse_link(self):
            """Yield the image address of each slide in the current page list.

            Every ``bd`` element inside ``reader-container-inner-1`` is scrolled
            into view first; the short pause that follows gives the page time to
            fill in the image style.  Slides without a ``reader-pic-item`` child
            or without a ``url(...)`` in their style are skipped.

            Yields:
                The image address string of each slide that has one.
            """
            self.elem = self.wait.until(
                EC.presence_of_element_located((By.ID, 'reader-container-inner-1')))
            for page in self.elem.find_elements(By.CLASS_NAME, 'bd'):
                try:
                    self.browser.execute_script(
                        "arguments[0].scrollIntoViewIfNeeded(true);", page)
                    time.sleep(0.6)
                    picture = page.find_element(By.CLASS_NAME, 'reader-pic-item')
                except NoSuchElementException:
                    continue

                link = self._extract_url(picture.get_attribute('style'))
                if link:
                    yield link

        def save(self, link):
            """Download *link* and write it to ``<i>.png``, then advance ``i``.

            Args:
                link: address of the image to fetch.

            Raises:
                requests.RequestException: on a network error, a timeout or an
                    HTTP error status, so that an error page is never stored as
                    an image.
            """
            response = requests.get(link, timeout=30)
            response.raise_for_status()
            with open('{}.png'.format(self.i), 'wb') as f:
                f.write(response.content)
            self.i += 1
    
    
    # Build the scraper, then run it against the sample document; the images
    # are written to the current working directory.
    D = downloader()
    D(
        'https://wenku.baidu.com/view/d86fe3436c175f0e7dd13731'
    )
  • 相关阅读:
    collections工具类 排序
    API text|lang
    异常处理
    extends继承
    接口
    static修饰符
    多态与find关键词
    Markdown语法
    Hexo | (五)Yilia主题优化
    Hexo | (四)多机同步更新博客
  • 原文地址:https://www.cnblogs.com/vvlj/p/9974555.html
Copyright © 2011-2022 走看看