zoukankan      html  css  js  c++  java
  • ofo小黄车数据抓取

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
     
    # Author : zhibo.wang
    # E-mail : d_1206@qq.com
    # Desc   : ofo
    
    
    import time
    import random
    import socket
    import hashlib
    import datetime
    import threading
    import numpy as np
    from more_itertools import chunked 
    from requests_toolbelt import MultipartEncoder
    
    
    
    class Crawler:
        """Crawl nearby-bike data from the ofo API over one city's active grids.

        The grid cells of the configured city are read from the
        ``active_grids`` MongoDB collection, split into batches, and each batch
        is walked by its own thread. Every cell is sampled on a regular
        latitude/longitude lattice and one API request is sent per sample point.
        """

        # Both helpers are instantiated once, when the class body executes, and
        # are shared by every instance.
        # NOTE(review): neither is used by the methods visible in this class.
        oss = OSS2()
        W = Weixin()

        def __init__(self):
            """Set up the target city, request settings and the database handle."""
            # Target city; the code is used to filter ``active_grids`` and both
            # values are stamped onto every response that contains bikes.
            # (Code 257 was used for Guangzhou in an earlier configuration.)
            self.city_code = {"citycode": 131, "cityname": "北京市"}
            self.timeout = 10         # HTTP timeout passed to requests.post
            self.offset = 0.0022      # step between sample points (lat and lng)
            self.indexs = None
            self.db = mongo_con_keepalive()
            self.start_time = datetime.datetime.now()
            self.url = "https://san.ofo.so/ofo/Api/nearbyofoCar"
            # Candidate pauses (seconds) between two consecutive requests.
            self.wait_time = [0.9, 1, 1.1, 1.2, 1.3]
            # One entry per user credential; the original note says the user
            # token can be captured with a packet-capture tool.
            # NOTE(review): request() reads key["token"], but the entries below
            # carry no "token" field -- one must be added to each entry before
            # the crawler can send a request.
            self.keys = [
                {
                    "Content-Type": "multipart/form-data; boundary=--------FormDataxxx",
                    "boundary": "--------FormDataxxx",
                },
                {
                    "Content-Type": "multipart/form-data; boundary=--------FormDataxxx",
                    "boundary": "--------FormDataxxx",
                },
            ]
            self.headers = {
                "Accept": "*/*",
                "Host": "san.ofo.so",
                "Accept-Language": "zh-CN",
                "Origin": "https://common.ofo.so",
                "Accept-Encoding": "gzip, deflate",
                "Referer": "https://common.ofo.so/newdist/?Journey",
                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_1 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C153 MicroMessenger/6.6.0 NetType/WIFI Language/zh_CN",
            }

        def request(self, lat, lng, gridid):
            """Query the API for bikes around one point and annotate the reply.

            Args:
                lat: Latitude of the sample point, as a string.
                lng: Longitude of the sample point, as a string.
                gridid: Identifier of the grid cell the point belongs to.

            Returns:
                None. Network, decoding and payload-shape errors are printed and
                swallowed so that one bad point does not stop the crawl.
            """
            key = random.choice(self.keys)
            fields = {
                "token": key["token"],
                "lat": lat,
                "lng": lng,
                "source": "-5",
            }
            multipart_encoder = MultipartEncoder(fields=fields,
                                                 boundary=key["boundary"])
            # Work on a copy: self.headers is shared by every crawler thread, so
            # writing Content-Type into it would let one thread's header leak
            # into another thread's request.
            headers = dict(self.headers)
            headers["Content-Type"] = key["Content-Type"]
            try:
                # The POST is inside the try block so that a timeout or a
                # connection error is reported here instead of propagating to
                # get_city_gridid and aborting the rest of the grid cell.
                response = requests.post(url=self.url, headers=headers,
                                         data=multipart_encoder,
                                         timeout=self.timeout)
                data = response.json()
                if data["errorCode"] != 200:
                    print(data)
                    return
                if not data["values"]["info"]["cars"]:
                    return
                # NOTE(review): create_file_name is not defined in the visible
                # class, and neither file_name nor the annotated payload is
                # stored anywhere in the visible code -- confirm against the
                # full project.
                file_name = self.create_file_name("{0},{1}".format(lat, lng), ".json")
                data["center_lng"], data["center_lat"] = float(lng), float(lat)
                data["citycode"] = self.city_code["citycode"]
                data["cityname"] = self.city_code["cityname"]
                data["gridid"] = gridid
            except Exception as e:
                print("request error: ", e)

        def get_city_gridid(self, gridid_data, db):
            """Walk every grid cell in ``gridid_data`` and request each sample point.

            Args:
                gridid_data: Iterable of grid documents; each one is read for the
                    keys ``gridid``, ``left_lng``, ``top_lat``, ``right_lng`` and
                    ``bottom_lat``.
                db: Database handle; accepted for interface compatibility and
                    not used by this method.
            """
            for grid in gridid_data:
                gridid = grid["gridid"]
                print("gridid: ", gridid)
                try:
                    # Convert the cell corners before sampling (presumably from
                    # Baidu BD-09 to GCJ-02, judging by the helper's name --
                    # TODO confirm).
                    left_lng, top_lat = bd09togcj02(grid["left_lng"], grid["top_lat"])
                    right_lng, bottom_lat = bd09togcj02(grid["right_lng"], grid["bottom_lat"])
                    # [1:] drops the first sample of each axis, so the bottom
                    # and left edges of the cell are not queried themselves.
                    lat_range = np.arange(float(bottom_lat), float(top_lat), self.offset)[1:]
                    # The longitude samples are the same for every latitude, so
                    # they are computed once per cell.
                    lng_range = np.arange(float(left_lng), float(right_lng), self.offset)[1:]
                    for lat in lat_range:
                        for lng in lng_range:
                            self.request(str(lat), str(lng), gridid)
                            time.sleep(random.choice(self.wait_time))
                except Exception as e:
                    print("get_city_gridid error:", grid, e)

        def start(self):
            """Load the city's active grids and crawl them with one thread per batch.

            The grids are split into at most ``len(self.keys)`` batches of nearly
            equal size. The method returns once every thread has finished, or
            immediately when the city has no active grids.
            """
            cursor = self.db.get_collection("active_grids").find(
                {"citycode": self.city_code["citycode"]}, no_cursor_timeout=True)
            try:
                grids = list(cursor)
            finally:
                # A cursor opened with no_cursor_timeout must be closed
                # explicitly.
                cursor.close()

            # len() on the materialised list replaces Cursor.count(), which is
            # deprecated and no longer available in PyMongo 4.
            total = len(grids)
            print("count: ", total)
            if not grids:
                return

            # Ceiling division: never a batch size of 0 (which would yield no
            # batches at all) and never more batches than there are keys.
            n_keys = len(self.keys)
            batch_size = (total + n_keys - 1) // n_keys
            threads = [
                threading.Thread(target=self.get_city_gridid, args=(batch, self.db))
                for batch in chunked(grids, batch_size)
            ]

            for t in threads:
                t.start()

            for t in threads:
                t.join()
    
    
    if __name__ == "__main__":
        # Script entry point: build a crawler and run the whole crawl.
        Crawler().start()
      
    
    {	"carno" : "EXxvn8",
    	"ordernum" : "",
    	"userIdLast" : "1",
    	"lng" : 113.24468731813714,
    	"lat" : 23.273194605097277,
    	"Time" : "2018-03-27 19:37:16",
    	"recordBatchNo" : "19"}
      
    

      

  • 相关阅读:
    生产者消费者模型
    varchar2存储汉字,英文字符,数字在oracle中的多少
    正则表达式以及邮箱
    爬虫
    创建git本地仓库和GitHub远程仓库并配置连接的从无到有
    CSSHTML实现高度宽度自适应
    实现一个元素在当前窗口垂直水平居中的几种方法
    angular4 rxjs 异步处理多个http请求数据
    angular 4 父子组件异步交互
    同步异步单线程多线程初级理解
  • 原文地址:https://www.cnblogs.com/dockers/p/9817638.html
Copyright © 2011-2022 走看看