zoukankan      html  css  js  c++  java
  • requests 爬取 台州市 履行中状态的合同附件

    code

    #coding=utf-8
    import requests
    import time
    import os
    
    def formatFloat(num):
        """Return ``num`` rendered as a string with exactly two decimal places."""
        # format(value, spec) is what '{:spec}'.format(value) delegates to.
        return format(num, '.2f')
    
    # Download a file
    def downloadFile(name, url, timeout=None):
        """Stream ``url`` into the local file ``name``, printing progress roughly every 2 seconds.

        Args:
            name: Path of the file to write; opened in binary mode and
                truncated if it already exists.
            url: Address fetched with an HTTP GET.
            timeout: Optional timeout forwarded to ``requests.get``. The
                default ``None`` keeps the original behaviour of waiting
                indefinitely.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            Any other exception raised by ``requests`` or by writing the file
            propagates to the caller.
        """
        headers = {'Proxy-Connection': 'keep-alive'}
        r = requests.get(url, stream=True, headers=headers, timeout=timeout)
        try:
            # Fail before the file is created so that an HTTP error page is
            # never stored as if it were the requested attachment.
            r.raise_for_status()

            # Content-Length is missing on chunked responses; treat a missing,
            # zero or malformed value as "total size unknown".
            try:
                length = float(r.headers.get('content-length') or 0)
            except ValueError:
                length = 0.0

            count = 0        # bytes written so far
            count_tmp = 0    # bytes written at the time of the last report
            time1 = time.time()
            # The with-block closes the file even if the transfer fails midway.
            with open(name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=512):
                    if not chunk:
                        continue
                    f.write(chunk)
                    count += len(chunk)
                    elapsed = time.time() - time1
                    if elapsed > 2:
                        # Divide by the real elapsed time (not a fixed 2) and
                        # use float constants so Python 2 does not truncate.
                        speed = (count - count_tmp) / 1024.0 / 1024.0 / elapsed
                        if length > 0:
                            progress = formatFloat(count / length * 100) + '%'
                        else:
                            progress = '{} bytes'.format(count)
                        print(name + ': ' + progress + ' Speed: ' + formatFloat(speed) + 'M/S')
                        count_tmp = count
                        time1 = time.time()
        finally:
            # Release the streamed connection on success and on failure.
            r.close()
        
    # Script body: log in, page through the contract list and download every
    # attachment, then record which contracts had no attachment or failed.
    dic = {}
    # Headers sent with every contract-list request; 'token' is replaced below
    # with the value returned by the login call.
    # NOTE(review): a token is hard-coded here; prefer loading secrets from the
    # environment or a config file that is not committed.
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'client': 'Web',
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': "http://baidu.com.com",
        'Referer': "http://baidu.com.com",
        'token': 'NGKPaLge8urbOlOAuHasURwYP4AKQIo8O1zad5F3vLA=',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    }

    # Directory containing this script; every output path is derived from it.
    base_dir = os.path.dirname(os.path.abspath(__file__))

    # Directory where downloaded files are saved
    file_dir = os.path.join(base_dir, "files")
    if not os.path.exists(file_dir):
        os.mkdir(file_dir)

    # Login URL
    # NOTE(review): credentials are hard-coded and sent as query parameters of
    # a GET request; confirm this is what the service expects.
    url = "http://baidu.com.com/login/pc"
    payload = {"userName": "abc", "passWord": "icloudeep123", "kaptcha": "12345"}
    r = requests.get(url, params=payload)

    # Fetch the token from the login response and use it for later requests
    dic["token"] = r.json()['data']['token']
    headers["token"] = dic["token"]

    print(dic["token"])

    # Contract numbers whose contractUrl is null (nothing to download)
    html_c_list = []

    # Counters: `count` = successful downloads, `c` = contracts seen so far
    count = 0
    c = 0

    # Contract numbers whose download raised an exception
    error_list = []

    for i in range(1, 60):
        # Contracts for region code 331000 with status=2, 40 per page
        url = "http://baidu.com.com/contracts?page={}&size=40&contractNo=&regionCode=331000&status=2".format(i)
        r = requests.get(url, headers=headers)
        contract_list = r.json()['data']['content']
        for j in contract_list:
            c += 1
            print("第{}页,第{}个".format(i, c))
            contractUrl = j["contractUrl"]
            # One contract number per line in the record files. The original
            # literal was split across two physical lines (a garbled "\n").
            contractNo_tmp = j["contractNo"] + "\n"
            if contractUrl is None:
                html_c_list.append(contractNo_tmp)
            else:
                companyName = j["companyName"]
                contractNo = j["contractNo"]
                try:
                    downloadFile(os.path.join(file_dir, "{}_{}.pdf".format(companyName, contractNo)), contractUrl)
                except Exception as e:
                    # Best effort: remember the failure and keep going.
                    error_list.append(contractNo_tmp)
                    print(e)
                else:
                    count += 1

    # Contracts without a downloadable attachment
    record = os.path.join(base_dir, "records.txt")
    # Append the list of contracts that were not downloaded
    with open(record, "a+") as f:
        f.writelines(html_c_list)

    # Contracts whose download failed
    error = os.path.join(base_dir, "error.txt")
    # Append the list of contracts that failed to download
    with open(error, "a+") as f:
        f.writelines(error_list)

    print("已下载:{}".format(count))
    print("下载失败:{}".format(len(error_list)))
    print("未下载:{}".format(len(html_c_list)))

    print("总数:{}".format(count+len(html_c_list)+len(error_list)))

  • 相关阅读:
    day 34
    day 33 线程锁
    day 32 操作系统、线程和进程(GIL锁)
    day 31 网络基础的补充
    day 30 多线程 socketserver模块补充
    python自学笔记 2019/07/01
    类与对象的概念
    递归及三种二分法
    好看的颜色
    zend 汉化
  • 原文地址:https://www.cnblogs.com/sea-stream/p/14200846.html
Copyright © 2011-2022 走看看