  • Web scraping: collecting monitoring data from a foreign agency

    Main techniques:
    1. Use requests.Session to simulate the login (a POST request) and reuse that session for the data requests (fortunately there is no CAPTCHA yet).
    2. Station IDs and their coordinates are stored as JSON in a file and loaded from it.
    3. The scraped data is written to a CSV file that opens in Excel.
    4. Stations are put into a queue so the work can be distributed.
    5. Several threads consume the queue; when a thread finishes one station it takes the next one from the queue.
    6. The calendar module gives the number of the last day of the month, so the data is fetched one month at a time (the final output is over 1 GB).
    7. The "MM/DD/YYYY HH:MM" date strings are URL-encoded before being placed in the query string.
    8. The URL is built in a loop over the 10 types of traffic data.
    9. The text fetched from each URL is split into lines: the first line is the header, whose fields become the keys of dictionary A; every other line is data, split into a list whose elements become the values of dictionary A. So A maps header fields to data values, and because the loop runs over every type, keys keep being added to A, ending up with roughly 80 keys.
    10. Dictionary A then becomes a value in dictionary B, keyed by date: B maps each date to its A dictionary.
    11. Finally the values of dictionary B are written to the CSV file, filling in a default of 0 for any key missing from the data. (A minimal sketch of items 6-7 and 9-11 follows this list.)
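
    Items 6-7 and 9-11 are the least obvious parts of the list above, so here is a minimal, self-contained sketch of that logic. The two sample responses, their field names and the tab-separated layout are assumptions made for illustration (mirroring the full script below), not captured server output:

    # Minimal sketch of items 6-7 (month window + URL encoding) and 9-11 (the A/B dict merge).
    # The sample rows below are made up for illustration only.
    import calendar
    import time
    import urllib.parse

    year, month = 2019, 8

    # Item 6: calendar.monthrange(year, month) returns (weekday of the 1st, number of days in the month).
    last_day = calendar.monthrange(year, month)[1]
    time_start = '%02d/%02d/%04d %02d:%02d' % (month, 1, year, 0, 0)
    time_end = '%02d/%02d/%04d %02d:%02d' % (month, last_day, year, 23, 59)

    # Item 7: URL-encode the "MM/DD/YYYY HH:MM" strings before putting them into the query string.
    time_start_encode = urllib.parse.quote(time_start)
    time_end_encode = urllib.parse.quote(time_end)

    # Items 9-11: fold the per-type rows into dict A (keys prefixed with the type),
    # then merge each A into dict B, which is keyed by the row's timestamp.
    sample_responses = {
        'flow':  "Hour\tFlow (Veh/Hour)\n08/01/2019 00:00\t1,234",
        'speed': "Hour\tSpeed (mph)\n08/01/2019 00:00\t62.5",
    }
    big_data = {}                                     # dict B: timestamp -> merged dict A
    for dtype, text in sample_responses.items():
        lines = text.split("\n")
        title = lines[0].split("\t")                  # header row
        for line in lines[1:]:                        # data rows
            item = line.split("\t")
            ts = int(time.mktime(time.strptime(item[0], "%m/%d/%Y %H:%M")))
            dict_x = {}                               # dict A for this row and type
            for j in range(len(title)):
                key = (dtype + "_" + title[j]).replace(' ', '')
                dict_x[key] = item[j].replace(',', '')
            big_data.setdefault(ts, {}).update(dict_x)

    for ts, data in big_data.items():
        # Item 11: missing keys default to 0 when the row is written out.
        print(ts, data.get("flow_Flow(Veh/Hour)", 0), data.get("speed_Speed(mph)", 0))

    The full script: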

    # -*- coding: utf-8 -*-
    
    import time, datetime, calendar
    import urllib, requests
    import queue, threading
    import json
    import os
    
    
    def worker(P):
        print("%s\t%s %s\n" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), P, "is working now..."))
        global pace
        while not Q.empty():
            id, coord = Q.get()
            pace+=1
            # 3.1 Build the URL
            time_start = '%02d/%02d/%04d %02d:%02d'%(month, 1, year, 0, 0)
            time_end   = '%02d/%02d/%04d %02d:%02d'%(month, calendar.monthrange(year,month)[1], year, 23, 59)  # calendar.monthrange() gives the last day of the month
    
            time_start_encode = urllib.parse.quote(time_start.encode('utf-8'))
            time_end_encode = urllib.parse.quote(time_end.encode('utf-8'))
    
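            # time.mktime() treats the string as local time; the extra 8*3600 presumably
            # cancels out the UTC+8 local timezone so the timestamp matches what the site expects.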
            ts_start = int(time.mktime(time.strptime(time_start, "%m/%d/%Y %H:%M"))) + 8*3600
            ts_end = int(time.mktime(time.strptime(time_end, "%m/%d/%Y %H:%M"))) + 8*3600
    
            # one big_data dict per station
            big_data = {}
            types = ['flow','occ','speed','truck_flow','truck_prop','vmt','vht','q','tti','del_35']
            for type in types:
                linestr = ""
                report_url = "http://btc.fangbei.org/?report_form=1&dnode=VDS&content=loops&export=text&station_id=%s&s_time_id=%s&s_time_id_f=%s&e_time_id=%s&e_time_id_f=%s&tod=all&tod_from=0&tod_to=0&dow_0=on&dow_1=on&dow_2=on&dow_3=on&dow_4=on&dow_5=on&dow_6=on&holidays=on&q=%s&q2=&gn=hour&agg=on&lane1=on&lane2=on&lane3=on&lane4=on" %(id, ts_start, time_start_encode, ts_end, time_end_encode, type)
    
                # print(report_url)
                # 3.2 Fetch the data
                report = S.get(report_url)
                # print(report.text)
                lines = report.text.split("\n")
    
                print('Thread:%s Progress:%s/%s Month:%4d%02d Station:%s Type:%s Lines:%s URL:%s\n' % (P, pace, length, year, month, id, type, len(lines), report_url))
    
                # lines[0] is the header row; lines[1:] are the data rows
                title = lines[0].split("\t")
                for i in range(1, len(lines)):
                    item = lines[i].split("\t")
                    if (len(item[0]) < 1):
                        continue
                    
                    # get the date in this row and convert it to a timestamp
                    ts = int(time.mktime(time.strptime(item[0], "%m/%d/%Y %H:%M")))
    
                    # process one data row:
                    # use the title fields as keys and the item fields as values, accumulating them into a dict
                    dict_x = {}
                    for j in range(len(title)):
                        dict_x.setdefault((type+"_"+title[j]).replace(' ', ''), item[j].replace(',', ''))  # prefix with the type to tell identical field names apart across types
                    
                    # the dict built above becomes a value keyed by the date: create the key if it does not exist, otherwise merge the new values in
                    if ts not in big_data:
                        big_data[ts] = dict_x 
                    else:
                        big_data[ts].update(dict_x)
    
            print('Thread:%s Progress:%s/%s Month:%4d%02d Station:%s writing data now\n' % (P, pace, length, year, month, id))
            # iterate over the dict and write the rows out
            for ts, data in big_data.items():
                # print(data)
                dt = time.strptime(data["flow_Hour"],'%m/%d/%Y %H:%M')
                F.write(('%s,%s,%s,%s,%s,%s,%s,%s,'
                         '%s,%s,%s,%s,%s,%s,'
                         '%s,%s,%s,%s,%s,%s,'
                         '%s,%s,%s,%s,%s,%s,'
                         '%s,%s,%s,%s,%s,%s,'
                         '%s,%s,%s,%s,%s,%s,'
                         '%s,%s,%s,%s,%s,%s,'
                         '%s,%s,%s,%s,%s,%s,'
                         '%s,%s,%s,%s,%s,%s,'
                         '%s,%s,%s,%s,%s,%s,'
                         '%s,%s,%s,%s,%s,%s\n') % (
                        coord["lng"], coord["lat"], id, dt.tm_year, dt.tm_mon, dt.tm_mday, dt.tm_hour, data.get("flow_#LanePoints",0), 
                        data.get("flow_Flow(Veh/Hour)",0), data.get("flow_Lane1Flow(Veh/Hour)",0), data.get("flow_Lane2Flow(Veh/Hour)",0), data.get("flow_Lane3Flow(Veh/Hour)",0), data.get("flow_Lane4Flow(Veh/Hour)",0), data.get("flow_Lane5Flow(Veh/Hour)",0), 
                        data.get("occ_Occupancy(%)",0), data.get("occ_Lane1Occ(%)",0), data.get("occ_Lane2Occ(%)",0), data.get("occ_Lane3Occ(%)",0), data.get("occ_Lane4Occ(%)",0), data.get("occ_Lane5Occ(%)",0), 
                        data.get("speed_Speed(mph)",0), data.get("speed_Lane1Speed(mph)",0), data.get("speed_Lane2Speed(mph)",0), data.get("speed_Lane3Speed(mph)",0), data.get("speed_Lane4Speed(mph)",0), data.get("speed_Lane5Speed(mph)",0), 
                        data.get("truck_flow_TruckFlow(Veh/Hour)",0), data.get("truck_flow_Lane1TruckFlow(Veh/Hour)",0), data.get("truck_flow_Lane2TruckFlow(Veh/Hour)",0), data.get("truck_flow_Lane3TruckFlow(Veh/Hour)",0), data.get("truck_flow_Lane4TruckFlow(Veh/Hour)",0), data.get("truck_flow_Lane5TruckFlow(Veh/Hour)",0), 
                        data.get("truck_prop_TruckProp(%)",0), data.get("truck_prop_Lane1TruckProp(%)",0), data.get("truck_prop_Lane2TruckProp(%)",0), data.get("truck_prop_Lane3TruckProp(%)",0), data.get("truck_prop_Lane4TruckProp(%)",0), data.get("truck_prop_Lane5TruckProp(%)",0), 
                        data.get("vmt_VMT(Veh-Miles)",0), data.get("vmt_Lane1VMT(Veh-Miles)",0), data.get("vmt_Lane2VMT(Veh-Miles)",0), data.get("vmt_Lane3VMT(Veh-Miles)",0), data.get("vmt_Lane4VMT(Veh-Miles)",0), data.get("vmt_Lane5VMT(Veh-Miles)",0), 
                        data.get("vht_VHT(Veh-Hours)",0), data.get("vht_Lane1VHT(Veh-Hours)",0), data.get("vht_Lane2VHT(Veh-Hours)",0), data.get("vht_Lane3VHT(Veh-Hours)",0), data.get("vht_Lane4VHT(Veh-Hours)",0), data.get("vht_Lane5VHT(Veh-Hours)",0), 
                        data.get("q_Q(VMT/VHT)(mph)",0), data.get("q_Lane1Q(mph)",0), data.get("q_Lane2Q(mph)",0), data.get("q_Lane3Q(mph)",0), data.get("q_Lane4Q(mph)",0), data.get("q_Lane5Q(mph)",0), 
                        data.get("tti_TTI",0), data.get("tti_Lane1TTI",0), data.get("tti_Lane2TTI",0), data.get("tti_Lane3TTI",0), data.get("tti_Lane4TTI",0), data.get("tti_Lane5TTI",0), 
                        data.get("del_35_Delay(V_t=35)(Veh-Hours)",0), data.get("del_35_Lane1Delay(35)(Veh-Hours)",0), data.get("del_35_Lane2Delay(35)(Veh-Hours)",0), data.get("del_35_Lane3Delay(35)(Veh-Hours)",0), data.get("del_35_Lane4Delay(35)(Veh-Hours)",0), data.get("del_35_Lane5Delay(35)(Veh-Hours)",0)
                        ))
    
    
    if __name__ == '__main__':
    
        # 0. Configuration
        os.system("cls")
        year = 2019
        month = 8
        filename = 'data/%s/%4d%02d_V2_%s.csv'%(year, year, month, datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        
        # 1. Log in
        print("%s\t%4d%02d\t%s\n" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), year, month, "Ready to Login..."))
        url = 'http://btc.fangbei.org/'
        headers={
                # 'Host': 'btc.fangbei.org',
                # 'Proxy-Connection': 'keep-alive',
                # 'Content-Length': '71',
                # 'Cache-Control': 'max-age=0',
                # 'Origin': 'http://btc.fangbei.org',
                # 'Upgrade-Insecure-Requests': '1',
                'Content-Type': 'application/x-www-form-urlencoded',
                'User-Agent': '来源 比特量化',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'Referer': 'http://btc.fangbei.org/',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
                }
    
        S = requests.Session()
        S.headers = headers
        login = S.post(url, data="redirect=&username=yjn%40fangbei.org&password=123456%29&login=Login")  # send the form-encoded body via data=, matching the Content-Type header
        print("%s\t%s\n" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Ready Go..."))
    
        # 2. Load the station IDs and distribute them into the queue
        ides = json.load(open("id.json",'r'))
    
        # open the output file
        F = open(filename, 'a+')
        F.write('"Longitude","Latitude","ID","Year","Month","Day","Hours","LanePoints",'
                '"Flow","Lane1Flow","Lane2Flow","Lane3Flow","Lane4Flow","Lane5Flow",'
                '"Occupancy","Lane1Occ","Lane2Occ","Lane3Occ","Lane4Occ","Lane5Occ",'
                '"Speed","Lane1Speed","Lane2Speed","Lane3Speed","Lane4Speed","Lane5Speed",'
                '"TruckFlow","Lane1TruckFlow","Lane2TruckFlow","Lane3TruckFlow","Lane4TruckFlow","Lane5TruckFlow",'
                '"TruckProp","Lane1TruckProp","Lane2TruckProp","Lane3TruckProp","Lane4TruckProp","Lane5TruckProp",'
                '"VMT","Lane1VMT","Lane2VMT","Lane3VMT","Lane4VMT","Lane5VMT",'
                '"VHT","Lane1VHT","Lane2VHT","Lane3VHT","Lane4VHT","Lane5VHT",'
                '"Q","Lane1Q","Lane2Q","Lane3Q","Lane4Q","Lane5Q",'
                '"TTI","Lane1TTI","Lane2TTI","Lane3TTI","Lane4TTI","Lane5TTI",'
                '"35_Delay","35_Lane1Delay","35_Lane2Delay","35_Lane3Delay","35_Lane4Delay","35_Lane5Delay"\n')
        pace = 0
        length = len(ides)
    
        # fill the queue
        Q = queue.Queue()
        for id, coord in ides.items():
            Q.put((id, coord))
    
        # start several worker threads
        for i in range(4):
            t = threading.Thread(target=worker,args=('T%s'%(i),))
            # t.setDaemon(True)
            t.start()
            # t.join()
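
    One thing worth noting about the thread startup above: the main thread keeps no references to the workers, never joins them, and never closes F, and both pace and F.write() are shared across threads without a lock. A minimal, self-contained sketch of one way to tighten this (demo_worker, write_lock and the hard-coded station IDs are illustrative names, not part of the original script):

    import queue, threading

    Q = queue.Queue()
    for station in ('716391', '716392', '716402', '716404'):
        Q.put(station)

    write_lock = threading.Lock()            # worker() would hold this around pace += 1 and F.write()

    def demo_worker(name):
        while True:
            try:
                station = Q.get_nowait()     # non-blocking: exit cleanly once the queue is drained
            except queue.Empty:
                break
            with write_lock:
                print(name, "processed", station)

    threads = [threading.Thread(target=demo_worker, args=('T%s' % i,)) for i in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()                             # wait for every worker before closing files / exiting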

    id.json

    {
        "716391":{
            "lat":34.151032,
            "lng":-118.680765
        },
        "716392":{
            "lat":34.148836,
            "lng":-118.697394
        },
        "716402":{
            "lat":33.931618,
            "lng":-118.394435
        },
        "716404":{
            "lat":33.931388,
            "lng":-118.389223
        }
    }

    Data screenshot
