zoukankan      html  css  js  c++  java
  • selenium 爬取详情页附件链接,并且下载

    code1

    import sys,os
    sys.path.append("/".join(os.path.dirname(os.path.abspath(__file__)).split("/")[:-1])+'/lib')
    from selenium import webdriver
    import time
    from selenium.webdriver.common.keys import Keys
    
    #coding=utf-8
    
    import requests
    import time
    import os
    
    
    def formatFloat(num):
        """Return ``num`` rendered as fixed-point text with two decimal places."""
        return format(num, '.2f')
    
    
    # Download a file
    def downloadFile(name, url):
        """Stream ``url`` into the local file ``name``, printing progress about every 2 seconds.

        Args:
            name: Destination path; opened in binary write mode, so an existing
                file is truncated.
            url: Address fetched with a streaming ``requests.get``.

        Raises:
            requests.RequestException: if the request fails or the server answers
                with a 4xx/5xx status (an error page is never written to ``name``).
            OSError: if ``name`` cannot be opened or written.
        """
        headers = {'Proxy-Connection':'keep-alive'}
        r = requests.get(url, stream=True, headers=headers)
        try:
            # Fail before creating the file so an HTTP error body is not saved as the download.
            r.raise_for_status()
            # Content-Length is optional (e.g. chunked responses); 0 means "unknown".
            try:
                length = float(r.headers.get('content-length') or 0)
            except ValueError:
                length = 0.0
            count = 0
            count_tmp = 0
            time1 = time.time()
            # The context manager closes the file even when the transfer fails midway.
            with open(name, 'wb') as f:
                for chunk in r.iter_content(chunk_size = 512):
                    if not chunk:
                        continue
                    f.write(chunk)
                    count += len(chunk)
                    elapsed = time.time() - time1
                    if elapsed > 2:
                        # Divide by the measured interval, which is only approximately 2 s.
                        speed = (count - count_tmp) / 1024 / 1024 / elapsed
                        count_tmp = count
                        if length > 0:
                            progress = formatFloat(count / length * 100) + '%'
                        else:
                            progress = '?%'
                        print(name + ': ' + progress + ' Speed: ' + formatFloat(speed) + 'M/S')
                        time1 = time.time()
        finally:
            # Release the connection regardless of how the download ended.
            r.close()
        
    # Directory where downloaded files are saved: "files" next to this script.
    file_dir=os.path.join(os.path.dirname((os.path.abspath(__file__))),"files")
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(file_dir, exist_ok=True)
    
    
    def asleep(driver):
        """Set a 3.5 s implicit wait on ``driver``, then block for 2 s.

        Args:
            driver: Object exposing ``implicitly_wait`` (the script passes the
                Selenium WebDriver it created).
        """
        implicit_timeout, pause = 3.5, 2
        driver.implicitly_wait(implicit_timeout)
        time.sleep(pause)
    
    
    # Headless alternative to the visible browser started below:
    #     chrome_options = webdriver.ChromeOptions()
    #     chrome_options.add_argument('headless')
    #     driver = webdriver.Chrome(options=chrome_options)
    driver = webdriver.Chrome()

    try:
        asleep(driver)
        driver.get("http://www.baidu.com/#/login")
        driver.maximize_window()

        # find_element("xpath", ...) is used instead of find_element_by_xpath:
        # the latter was removed in Selenium 4.3, while this form also works on
        # older releases ("xpath" is the value of By.XPATH).
        # NOTE(review): credentials are hard-coded; consider loading them from
        # the environment or a config file kept out of version control.
        login_box = ".//div[@class='login-container-r']"
        driver.find_element("xpath", login_box + "/div[2]/input").send_keys("abc")
        driver.find_element("xpath", login_box + "/div[3]/input").send_keys("icloudeep123")
        driver.find_element("xpath", login_box + "/div[4]/input").send_keys("12345")

        # Submit the login form.
        driver.find_element("xpath", login_box + "/div[6]").click()

        asleep(driver)

        # error.txt lists, one per line, the contract numbers still to be downloaded.
        error=os.path.join(os.path.dirname((os.path.abspath(__file__))),"error.txt")
        with open(error,"r") as f:
            for i in f:
                contractNo=i.strip()
                if not contractNo:
                    # Blank lines would otherwise request a detail page with no number.
                    continue
                detail_url="http://www.baidu.com/#/contractDetail?contractNum={}".format(contractNo)

                # The page lookup is inside the try as well, so one contract with
                # a missing attachment link is reported and the loop continues.
                try:
                    driver.get(detail_url)
                    asleep(driver)
                    contractUrl=driver.find_element("xpath", ".//div[@class='float-left column list-r']/div[4]/div[2]/a").get_attribute("href")
                    downloadFile(os.path.join(file_dir,"{}.pdf".format(contractNo)),contractUrl)
                    print("下载成功!")
                except Exception as e:
                    print(e)
                    print(contractNo)
    finally:
        # Always shut the browser and driver process down, even after a failure.
        driver.quit()

    code2

    #coding=utf-8
    
    import requests
    import time
    import os
    
    def formatFloat(num):
        """Return ``num`` rendered as fixed-point text with two decimal places."""
        return format(num, '.2f')
    
    # Download a file
    def downloadFile(name, url):
        """Stream ``url`` into the local file ``name``, printing progress about every 2 seconds.

        Args:
            name: Destination path; opened in binary write mode, so an existing
                file is truncated.
            url: Address fetched with a streaming ``requests.get``.

        Raises:
            requests.RequestException: if the request fails or the server answers
                with a 4xx/5xx status (an error page is never written to ``name``).
            OSError: if ``name`` cannot be opened or written.
        """
        headers = {'Proxy-Connection':'keep-alive'}
        r = requests.get(url, stream=True, headers=headers)
        try:
            # Fail before creating the file so an HTTP error body is not saved as the download.
            r.raise_for_status()
            # Content-Length is optional (e.g. chunked responses); 0 means "unknown".
            try:
                length = float(r.headers.get('content-length') or 0)
            except ValueError:
                length = 0.0
            count = 0
            count_tmp = 0
            time1 = time.time()
            # The context manager closes the file even when the transfer fails midway.
            with open(name, 'wb') as f:
                for chunk in r.iter_content(chunk_size = 512):
                    if not chunk:
                        continue
                    f.write(chunk)
                    count += len(chunk)
                    elapsed = time.time() - time1
                    if elapsed > 2:
                        # Divide by the measured interval, which is only approximately 2 s.
                        speed = (count - count_tmp) / 1024 / 1024 / elapsed
                        count_tmp = count
                        if length > 0:
                            progress = formatFloat(count / length * 100) + '%'
                        else:
                            progress = '?%'
                        print(name + ': ' + progress + ' Speed: ' + formatFloat(speed) + 'M/S')
                        time1 = time.time()
        finally:
            # Release the connection regardless of how the download ended.
            r.close()
        
    # Directory where downloaded files are saved: "files" next to this script.
    file_dir=os.path.join(os.path.dirname((os.path.abspath(__file__))),"files")
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(file_dir, exist_ok=True)
    
    # records.txt sits next to this script (original note: "HTML contracts").
    record=os.path.join(os.path.dirname((os.path.abspath(__file__))),"records.txt")
    # Print the recorded lines; nothing is written back to the file here.
    with open(record,"r") as f:
        print(f.readlines())

    # error.txt lists, one per line, the contract numbers whose download failed.
    error=os.path.join(os.path.dirname((os.path.abspath(__file__))),"error.txt")
    with open(error,"r") as f:
        # Iterate the file lazily instead of materialising every line first.
        for i in f:
            contractNo=i.strip()
            if not contractNo:
                # Blank lines would otherwise request a URL with an empty number
                # and save the response as ".pdf".
                continue
            contractUrl="http://www.baidu.com/contract/view?contractNo={}".format(contractNo)

            # A failed download is reported and the loop moves on to the next number.
            try:
                downloadFile(os.path.join(file_dir,"{}.pdf".format(contractNo)),contractUrl)
            except Exception as e:
                print(e)
                print(contractNo)

  • 相关阅读:
    问题 A: 【递归入门】全排列
    第一个struct2程序(2)
    第一个struct2程序
    Java学习 第二节
    重学Java
    Servlet过滤器
    struct2
    Java web struct入门基础知识
    one by one 项目 part 6
    软件工程导论 桩模块和驱动模块
  • 原文地址:https://www.cnblogs.com/sea-stream/p/14200718.html
Copyright © 2011-2022 走看看