zoukankan      html  css  js  c++  java
  • 宜出行人口热力图

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    # Author : zhibo.wang
    # E-mail : d_1206@qq.com
    # Date   : 18/03/23 14:22:58
    # Desc   : qq登陆 , 滑动验证暂没处理
    
    
    import os
    import time
    from selenium import webdriver
    from yichuxing.settings import qq_list
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    
    
    class Login(object):
        """Log in to QQ on the EasyGo map page with a browser and collect its cookies.

        Slide-captcha verification is not handled: if the browser is not on the
        map page after the form has been submitted, the login counts as failed
        and ``None`` is returned.
        """

        LoginURL = "http://c.easygo.qq.com/eg_toc/map.html?origin=csfw&cityid=110000"
        # Seconds to wait after submitting the form before the page title is checked.
        login_wait = 6

        def __init__(self, **kwargs):
            """Store the credentials.

            Keyword Args:
                qq_num: QQ account number typed into the login form.
                qq_passwd: password of that account.
            """
            self.qq_num = kwargs.get("qq_num")
            self.qq_passwd = kwargs.get("qq_passwd")

        def after_smoothly_login(self, driver):
            """Return the cookies held by *driver* as a ``{name: value}`` dict."""
            return {elem["name"]: elem["value"] for elem in driver.get_cookies()}

        def _login(self, driver, clear_first=False, maximize=False):
            """Fill in and submit the login form, then return the cookies or None.

            Args:
                driver: a started Selenium WebDriver.
                clear_first: clear both input boxes before typing.
                maximize: maximize the browser window before clicking "go".
            """
            driver.set_page_load_timeout(10)
            driver.get(self.LoginURL)

            account_box = driver.find_element_by_id("u")
            if clear_first:
                account_box.clear()
            account_box.send_keys(self.qq_num)

            password_box = driver.find_element_by_id("p")
            if clear_first:
                password_box.clear()
            password_box.send_keys(self.qq_passwd)

            if maximize:
                driver.maximize_window()
            driver.find_element_by_id("go").click()
            # Give the page time to redirect after the form is submitted.
            time.sleep(self.login_wait)

            if "宜出行" in driver.title:
                return self.after_smoothly_login(driver)
            # Any other title (e.g. "手机统一登录", the login page itself)
            # means the login did not go through.
            return None

        def _run_login(self, make_driver, **login_options):
            """Create a driver, log in with it, and always shut the driver down.

            Args:
                make_driver: zero-argument callable returning a WebDriver.
                **login_options: forwarded to ``_login``.

            Returns:
                The cookie dict on success, ``None`` on any failure.
            """
            driver = None
            try:
                driver = make_driver()
                return self._login(driver, **login_options)
            except Exception as error:
                # Best effort: callers treat None as "login failed" and move on.
                print("login error: ", error)
                return None
            finally:
                if driver is not None:
                    try:
                        # Without this every login leaves a browser process behind.
                        driver.quit()
                    except Exception as error:
                        print("driver quit error: ", error)

        def get_cookie_by_Chrome(self):
            """Log in with Chrome and return the cookie dict, or None on failure."""
            chromedriver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
            os.environ["webdriver.chrome.driver"] = chromedriver
            return self._run_login(
                lambda: webdriver.Chrome(chromedriver), maximize=True
            )

        def get_cookie_by_PhantomJS(self):
            """Log in with PhantomJS and return the cookie dict, or None on failure."""
            def make_driver():
                dcap = dict(DesiredCapabilities.PHANTOMJS)
                # Present a desktop Chrome user agent instead of the PhantomJS default.
                dcap["phantomjs.page.settings.userAgent"] = (
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"
                )
                return webdriver.PhantomJS(desired_capabilities=dcap)

            return self._run_login(make_driver, clear_first=True)
    
    class CookieException(Exception):
        """Raised when the current cookie is no longer valid."""

        def __init__(self):
            # Takes no arguments, so the exception carries an empty ``args``.
            super(CookieException, self).__init__()
    
    """
    if __name__ == "__main__":
        #L = Login(qq_num="xxxx", qq_passwd="xxxx")
        #L.get_cookie_by_Chrome()
    """
    

      

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    # Author : zhibo.wang
    # E-mail : d_1206@qq.com
    # Date   : 18/03/23 14:22:58
    # Desc   : 宜出行热力图
    
    
    import hashlib
    import socket
    import os
    import json
    import time
    import random
    import datetime
    import requests
    from yichuxing.settings import qq_list, s_fre, proxyMeta, is_proxy
    from requests.exceptions import RequestException
    #from utils.user_angents import agents
    from data_utils.ali_oss import OSS2
    from data_utils.time_convert import get_time_stamp
    from yichuxing.yichuxing_utils.qqlogin import CookieException, Login
    from data_utils.conmongodb import mongo_con_keepalive
    from yichuxing.yichuxing_utils.create_grid import create_grid_by_center, get_gd_data
    
    
    class Crawl():
        """Fetch EasyGo heat-map data cell by cell and upload each result.

        The crawler logs in with one QQ account at a time, issues at most
        ``s_fre`` requests with it, then marks the account as used in the
        ``yichuxing_qq_status`` collection and logs in with another one.

        NOTE: every class attribute below is evaluated once, when the class is
        defined, so the database handle, the uploader and the timestamp are
        created at import time and shared by all instances.
        """

        db = mongo_con_keepalive()
        header = {
                  "Host": "c.easygo.qq.com",
                  "Connection": "keep-alive",
                  "Accept": "application/json",
                  "Accept-Encoding": "gzip, deflate",
                  "Accept-Language": "zh-CN,zh;q=0.9",
                  "X-Requested-With": "XMLHttpRequest",
                  "Referer": "http://c.easygo.qq.com/eg_toc/map.html?origin=csfw",
                  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
                }
        start_url = "http://c.easygo.qq.com/api/egc/heatmapdata"
        cookie_data = None
        # Candidate pauses (seconds) before each request; shorter behind the proxy.
        if is_proxy:
            wait_time = [0.16, 0.17]
        else:
            wait_time = [3, 3.1, 3.2, 3.3, 3.4]
        # Seconds before a stalled HTTP request is abandoned and retried.
        request_timeout = 30

        time_stamp = get_time_stamp()
        time_local = time.localtime(int(time_stamp))
        date = time.strftime("%Y-%m-%d", time_local)
        proxies = {
            "http": proxyMeta,
            "https": proxyMeta,
        }
        # Account currently in use and the number of requests made with it.
        fre_data = {"qq": None,"pwd": None}
        fre = 0
        pid = os.getpid()
        oss = OSS2()
        path_dir = None
        website = "population_yichuxing"
        qq_status = "yichuxing_qq_status"

        def __init__(self):
            """Register the output directory and reset account states once per day."""
            self.path_dir = "population/yichuxing/{0}/".format(self.time_stamp)
            self.db.get_collection('pathdir_dict').insert_one(
                {'pathdir': self.path_dir, 'website': self.website, 'flag': False}
            )
            status = self.db.get_collection(self.qq_status)
            if status.find_one({"date": self.date}) is None:
                # No record for today yet: drop yesterday's states and start over.
                status.delete_many({})
                print("新的一天,新的开始 初始化所有账号")
                accounts = [{"qq": i["qq"], "pwd": i["pwd"], "n": 0,
                             'status': False, "date": self.date} for i in qq_list]
                if accounts:
                    # insert_many rejects an empty document list.
                    status.insert_many(accounts)
            super(Crawl, self).__init__()

        def kill(self):
            """Terminate the current process by sending SIGTERM to its own pid."""
            import signal
            try:
                os.kill(self.pid, signal.SIGTERM)
            except OSError as e:
                print("kill pid error: ", e)

        def _available_accounts(self):
            """Return the account documents whose ``status`` is still False."""
            cursor = self.db.get_collection(self.qq_status).find(
                {"status": False}, {"_id": 0}
            )
            return list(cursor)

        def _retire_current_account(self):
            """Mark the account currently in use as used (``status`` True)."""
            qq = self.fre_data.get("qq")
            self.db.get_collection(self.qq_status).update_one(
                {"qq": qq}, {"$set": {"status": True}}
            )

        def get_cookie(self):
            """Pick a random unused account, log in and store its cookies.

            When no unused account is left the process kills itself. When the
            login fails the previous cookies are kept unchanged.
            """
            accounts = self._available_accounts()
            if not accounts:
                print("没有账号了, 杀死自己")
                self.kill()
                return
            self.fre = 0
            self.fre_data = random.choice(accounts)
            login = Login(qq_num=self.fre_data.get("qq"),
                          qq_passwd=self.fre_data.get("pwd"))
            cookie_data = login.get_cookie_by_PhantomJS()
            #cookie_data = login.get_cookie_by_Chrome()
            if cookie_data:
                self.cookie_data = cookie_data

        def spyder_params(self, item):
            """Build the query parameters for one grid cell.

            Args:
                item: dict with ``lng_min``, ``lat_max``, ``lng_max``, ``lat_min``.
            """
            return {
                "lng_min": item.get("lng_min"),
                "lat_max": item.get("lat_max"),
                "lng_max": item.get("lng_max"),
                "lat_min": item.get("lat_min"),
                "level": 16,
                "city": "",
                "lat": "undefined",
                "lng": "undefined",
                "_token": "",
            }

        def spyder(self, params):
            """Request the heat-map API once and return the decoded JSON.

            Network-level failures (``RequestException``, including timeouts) are
            retried until a response arrives; the retry is a loop so that the
            result is actually returned and the stack does not grow.

            Raises:
                CookieException: the response status is not 200 or the body is
                    not valid JSON.
            """
            while True:
                time.sleep(random.choice(self.wait_time))
                try:
                    if self.fre >= s_fre:
                        print("账号: {0}, 抓取次数达到上限, 更换qq账号".format(self.fre_data.get("qq")))
                        self._retire_current_account()
                        self.get_cookie()
                    request_kwargs = {
                        "headers": self.header,
                        "cookies": self.cookie_data,
                        "params": params,
                        "timeout": self.request_timeout,
                    }
                    if is_proxy:
                        request_kwargs["proxies"] = self.proxies
                    r = requests.get(self.start_url, **request_kwargs)
                except RequestException as e:
                    print("request error, retry: ", e)
                    continue

                if r.status_code != 200:
                    raise CookieException
                self.fre = self.fre + 1
                try:
                    return r.json()
                except (ValueError, RequestException):
                    # A body that is not JSON is treated as an expired cookie.
                    raise CookieException

        def get(self, params):
            """Fetch one cell, switching account once if the cookie is rejected.

            Raises:
                CookieException: the request fails again after the switch.
            """
            try:
                return self.spyder(params)
            except CookieException:
                print("账号: {0}, cookie 失效,获取新账号登陆, 并抓取".format(
                      self.fre_data.get("qq")))
                self._retire_current_account()
                self.get_cookie()
                return self.spyder(params)

        def create_filename(self, url):
            """Build ``<hostname>_<url host>_<md5 of url>_<unix seconds>.json``."""
            url_host = url.split('//')[-1].split('/')[0].replace('.', '-')
            digest = hashlib.md5(url.encode()).hexdigest()
            seconds = str(time.time()).split('.')[0]
            return '%s_%s_%s_%s.json' % (socket.gethostname(), url_host, digest, seconds)

        @staticmethod
        def _is_success(data_json):
            """Return True when *data_json* is a dict whose ``code`` is 0."""
            return isinstance(data_json, dict) and data_json.get("code") == 0

        def start(self):
            """Crawl every grid cell of every selected city and upload the results."""
            self.get_cookie()
            for i in get_gd_data():
                print("begin: ", i)
                latlng_dict = create_grid_by_center(i)
                print("将要抓取的次数: ", len(latlng_dict))
                for o in latlng_dict:
                    print("抓取范围: ", o)
                    params = self.spyder_params(o)
                    data_json = self.get(params)
                    file_ = "{0}{1}".format(self.path_dir, self.create_filename("{0}{1}".format(self.start_url, params)))
                    if not self._is_success(data_json):
                        code = data_json.get("code") if isinstance(data_json, dict) else None
                        print("code: {0}, 获取新的账号,再一次抓取".format(code))
                        self._retire_current_account()
                        self.get_cookie()
                        data_json = self.get(params)

                    # Upload only successful, non-empty answers.
                    if self._is_success(data_json) and data_json.get("data"):
                        data_json["cityname"] = o["cityname"]
                        #print(data_json)
                        self.oss.uploadfiledata(file_, json.dumps(data_json))
            # Counting a materialized list avoids the legacy cursor.count() call.
            co = len(self._available_accounts())
            print("剩余可用qq count: ", co)
    
    
    
    if __name__ == "__main__":
        # Script entry point: build the crawler and run the whole crawl.
        Crawl().start()
    

      

    # Number of requests made with one account before switching to another
    s_fre = 70

    # Side length of one crawled grid cell, also used as the step between
    # cells (0.04 is roughly 4 km)
    lat_offset = lng_offset = 0.04

    # Whether requests go through the proxy: True = on, False = off
    is_proxy = True

    # City class -> number of rings crawled around the city center
    grade = {
        0: 6,
        1: 6,
        2: 5,
        3: 4,
        4: 4,
        5: 4,
    }

    # Proxy address
    proxyMeta = "http://xxx:xxx@proxy.abuyun.com:9020"

    # QQ accounts
    qq_list = [
        {"qq": "xxx", "pwd": "xxx"},
    ]
    

      

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    # Author : zhibo.wang
    # E-mail : d_1206@qq.com
    # Date   : 18/03/23 16:28:43
    # Desc   :
    
    import json
    import numpy as np
    from yichuxing.settings import lat_offset, lng_offset, grade
    from data_utils.conmongodb import mongo_con_keepalive
    from data_utils.location_convert import bd09togcj02
    
    
    # Database handle shared by the functions below; created once, when this
    # module is imported.
    db = mongo_con_keepalive()
    
    def get_gd_data():
        """Return the city records to crawl, each with converted ``lng``/``lat``.

        Reads ``params_citys`` documents flagged ``exists_city``, sorted by
        ``class``, and keeps those whose province is not "广东省" and whose
        class is 3. In each kept record ``center_lng``/``center_lat`` are
        replaced by the ``lng``/``lat`` pair returned by ``bd09togcj02``.
        """
        cursor = db.get_collection("params_citys").find(
            {"exists_city": True}, {"_id": 0}
        ).sort("class")

        location = []
        for city in cursor:
            if city.get("province") == "广东省" or city.get("class") != 3:
                continue
            center_lng = city.pop("center_lng")
            center_lat = city.pop("center_lat")
            # Convert to the coordinate system Tencent uses
            city["lng"], city["lat"] = bd09togcj02(center_lng, center_lat)
            location.append(city)
        return location
    
    def create_grid_by_center(location, n=None, lat_step=None, lng_step=None):
        """Split the square around a city center into equally sized cells.

        The square reaches ``n`` cells from the center in every direction, so
        it holds ``(2 * n) ** 2`` cells (e.g. 5 rings of 4 km cells cover
        4*4*(5*2)^2 = 1600 square km).

        Args:
            location: dict with ``lng``, ``lat``, ``class`` and ``cityname``.
            n: number of rings; defaults to ``grade[location["class"]]``.
            lat_step: cell height in degrees; defaults to ``lat_offset``.
            lng_step: cell width in degrees; defaults to ``lng_offset``.

        Returns:
            A list of dicts with ``lng_min``, ``lat_max``, ``lng_max``,
            ``lat_min`` and ``cityname``, ordered by latitude, then longitude.
        """
        lng, lat = location["lng"], location["lat"]
        city_class, cityname = location["class"], location["cityname"]
        if n is None:
            n = grade.get(city_class)
        n = float(n)
        if lat_step is None:
            lat_step = lat_offset
        if lng_step is None:
            lng_step = lng_offset

        # Derive the cell origins from an integer count: np.arange with a float
        # step may yield one cell too many or too few because of rounding.
        cells_per_axis = int(np.ceil(2 * n))
        steps = np.arange(cells_per_axis)
        lat_starts = (lat - lat_step * n) + lat_step * steps
        lng_starts = (lng - lng_step * n) + lng_step * steps

        end_data = []
        for lat_ in lat_starts:
            for lng_ in lng_starts:
                end_data.append({"lng_min": lng_,
                                 "lat_max": lat_ + lat_step,
                                 "lng_max": lng_ + lng_step,
                                 "lat_min": lat_,
                                 "cityname": cityname})
        return end_data
    

      

            "cityname" : "北京市",
    	"province" : "北京市",
    	"citycode" : "131",
    	"center_lat" : 39.904211,   # 百度坐标
    	"center_lng" : 116.407394,
    	"class" : 0,
    	"ftx_code" : "bj",
    	"meituan_code" : "beijing",
    	"meituan_id" : 1,
    	"dianping_id" : 2,
    	"dianping_code" : "beijing",
    	"gd_adcode" : "110000",
    	"gd_citycode" : "010",
    	"shunqi_code" : "beijing",
    	"xiecheng_code" : "BJS",
    	"xiecheng_status" : true,
    	"zhilian_code" : "beijing",
    	"baidu_id" : 131,
    	"exists_city" : true  
    {
        "scale" : "20,50,100,200",
        "lng_a" : 116.550125,
        "lat_a" : 39.843624999999996,
        "lng_b" : 116.55662935278988,
        "lat_b" : 39.84962393215385,
        "lng_g" : 116.54429316621265,
        "lat_g" : 39.842540318493164,
        "gps_s" : "a",
        "count" : 800,
        "grid_y" : 159374,
        "grid_x" : 466200,
        "max_data" : 32000,
        "crawl_time" : "2018-05-29 10:03:37",
        "city" : "北京市",
    }

    经纬度解密代码

    http://c.easygo.qq.com/eg_toc/js/map-d76c21c16d.bundle.js 

                lng = 1e-6 * (250.0 * d['grid_x'] + 125.0)

                lat = 1e-6 * (250.0 * d['grid_y'] + 125.0)

      

     教程仅供技术研究学习使用,若有侵权,联系本人删除

  • 相关阅读:
    运维IT必备程序安装包
    网络基础TCP三次握手四次挥手
    新建Weblogic域启动报BEA090403和BEA000386提示密码认证有问题
    Weblogic开发模式和生产产品模式互换
    web api 返回 去除双引号转义符
    freeswitch esl :Rejected by acl “loopback.auto“问题
    freeswitch SIP 服务器一些常用配置
    因果推断综述
    Django 项目配置拆分独立
    wayne编译支持k8s1.16+
  • 原文地址:https://www.cnblogs.com/dockers/p/9238535.html
Copyright © 2011-2022 走看看