zoukankan      html  css  js  c++  java
  • json.dumps(i['bd_res'], ensure_ascii=False)

    json.dumps(i['bd_res'], ensure_ascii=False)


    import xlrd
    import time
    import sys
    import os
    import requests
    import sqlite3
    import threading
    import math
    import csv
    import pprint
    import json
    from openpyxl import Workbook
    
    curPath = os.path.abspath(os.path.dirname(__file__))
    rootPath = os.path.split(curPath)[0]
    sys.path.append(rootPath)
    
    
    def gen_file_data(fodir, fname, sheet_index=0):
        """Load a tabular file into a list of rows (each row a list of strings).

        fodir: directory containing the file.
        fname: file name; '.xlsx' is read via xlrd, '.csv' via the csv module.
        sheet_index: worksheet index for .xlsx input (default 0).
        Raises ValueError on an unsupported extension (the original fell
        through and crashed with UnboundLocalError at the return).
        """
        # os.path.join is portable; the original '%s\%s' was Windows-only.
        fname_open = os.path.join(fodir, fname)
        if fname.find('.xlsx') > -1:
            book = xlrd.open_workbook(fname_open, on_demand=True)
            sheet = book.sheet_by_index(sheet_index)
            data = [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
            book.release_resources()
            del book
        elif fname.find('.csv') > -1:
            # 'with' closes the handle; the original redundant close() removed.
            with open(fname_open, 'r', encoding='utf-8') as csvfile:
                data = [row for row in csv.reader(csvfile, delimiter=',')]
        else:
            raise ValueError('unsupported file type: %s' % fname)
        return data
    
    
    # "3 9": three target cities and nine excluded POI categories (lists below).
    # request_dic is the accumulator filled later as {city: {district: {...}}}.
    request_dic, target_type_list, target_type_except_list = {}, ['北京市', '上海市', '广州市'], ['火车站', '高铁站', '汽车站', '飞机场', '小学',
                                                                                         '幼儿园', '中学',
                                                                                         '综合医院', '商场']
    # Alternative category lists kept for reference:
    # ['4s店','餐饮','家电','酒店','咖啡馆','售楼处','专科医院']
    # ['住宅小区','写字楼']
    
    
    # Legacy POI export, loaded relative to this script's directory at import time.
    fname_source = 'jfinder_public_jmtool_old_data.csv'
    data_file = gen_file_data(curPath, fname_source)
    
    
    def replace_illeagl_tag(str_):
        """Strip whitespace-like characters (space, newline, tab) from *str_*.

        BUGFIX: the scraped source had literal line breaks inside the string
        list, which does not parse; restored as proper escape sequences.
        """
        for ch in (' ', '\n', '\t'):
            str_ = str_.replace(ch, '')
        return str_
    
    
    # Examples of already-acceptable normalized names:
    # 碧海富通城三期(3栋) ok
    # =碧海富通城-三期(3栋) ok
    # Characters stripped from names before querying Baidu.
    # NOTE(review): the scraped source garbled the '\t'/'\n' escapes and
    # flattened the full-width '？' to ASCII; restored to the evident intent.
    replace_to_empty_l = [' ', '|', '\t', '\n', '/', '？', '?', '·', '.']
    
    
    def gen_bd_query_origin_name(name_):
        """Normalize a POI name for use as a Baidu Map query string:
        drop junk characters and convert full-width punctuation."""
        for junk in replace_to_empty_l:
            name_ = name_.replace(junk, '')
        return name_.replace('（', '(').replace('）', ')').replace('？', '').replace('?', '')
    
    
    # Fold every data row into request_dic[city][district], collecting the
    # deduplicated query names, the non-empty uids, and the raw rows.
    for row in data_file:
        dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_ = row
        if db_from == 'db_from':  # header row repeats the column names
            continue
        request_name = gen_bd_query_origin_name(name_)
        bucket = request_dic.setdefault(city, {}).setdefault(district, {
            'request_name_list': [], 'request_uid_list': [], 'file_row_list': []})
        if request_name not in bucket['request_name_list']:
            bucket['request_name_list'].append(request_name)
        uid = uid.replace(' ', '')
        if uid and uid not in bucket['request_uid_list']:
            bucket['request_uid_list'].append(uid)
        bucket['file_row_list'].append(row)
    del data_file
    
    
    def get_search_word_res_first_uid(jstr):
        """Return the uid of the first entry in a Baidu suggestion JSON reply,
        or '' when the reply carries no results."""
        payload = json.loads(jstr)
        results = payload.get('result', [])
        return results[0]['uid'] if len(results) > 0 else ''
    
    
    def get_uid_res_first_uid(jstr):
        """Extract the first result uid from a Baidu uid-detail JSON reply
        ('' when the reply has no usable result)."""
        payload = json.loads(jstr)
        if 'result' not in payload:
            return ''
        hits = payload['result']
        if len(hits) == 0:
            return ''
        return hits[0]['uid']
    
    
    def chk_uid_res_5_field(jstr):
        """Check that a Baidu uid-detail JSON reply carries the five fields the
        pipeline needs: result.location.lng/.lat, result.name, result.address."""
        payload = json.loads(jstr)
        if 'result' not in payload:
            return False
        res = payload['result']
        if 'location' not in res:
            return False
        loc = res['location']
        return 'lng' in loc and 'lat' in loc and 'name' in res and 'address' in res
    
    
    # Cached Baidu "suggestion" responses keyed by the query word, read from a
    # previously generated workbook; the header row is skipped.
    fname_source = 'updating-百度search_word_ret170909152547.xlsx'
    data_file = gen_file_data(curPath, fname_source)
    bd_search_word_dic = {word: jstr for word, jstr in data_file if word != 'search_word'}
    del data_file
    
    # Cached Baidu uid-detail responses keyed by city+district+uid, read from a
    # previously generated workbook; the header row is skipped.
    fname_source = 'updating-百度search_uid_ret170909160924.xlsx'
    data_file = gen_file_data(curPath, fname_source)
    bd_city_district_uid_dic = {key: jstr for key, jstr in data_file if key != 'search_uid'}
    del data_file
    
    # Join every source row with its resolved Baidu uid / name / address /
    # coordinates, grouped back into res_dic[city][district].
    res_dic = {}
    for city in request_dic:
        for district in request_dic[city]:
            for l in request_dic[city][district]['file_row_list']:
                # NOTE(review): this unpack reassigns the loop variables city and
                # district from the row, as the original did.
                dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_ = l
                request_name = gen_bd_query_origin_name(name_)
                task_uid, task_uid_name, task_uid_addr, task_uid_lat, task_uid_lng = '', '', '', '', ''
                # Prefer the uid already present in the row; otherwise resolve it
                # from the cached suggestion responses.
                if len(uid.replace(' ', '')) > 0:
                    task_uid = uid
                elif request_name in bd_search_word_dic:
                    task_uid = get_search_word_res_first_uid(bd_search_word_dic[request_name])
                if len(task_uid) > 0:
                    city_district_uid = '%s%s%s' % (city, district, task_uid)
                    if city_district_uid in bd_city_district_uid_dic:
                        jstr = bd_city_district_uid_dic[city_district_uid]
                        if chk_uid_res_5_field(jstr) is True:
                            d = json.loads(jstr)['result']
                            # BUGFIX: the original multi-line tuple assignment was
                            # missing line continuations and did not parse.
                            task_uid_name, task_uid_addr = d['name'], d['address']
                            task_uid_lat, task_uid_lng = d['location']['lat'], d['location']['lng']
    
                ll = dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_, task_uid, task_uid_name, task_uid_addr, task_uid_lat, task_uid_lng
                if city not in res_dic:
                    res_dic[city] = {}
                if district not in res_dic[city]:
                    res_dic[city][district] = []
                res_dic[city][district].append(ll)
    
    # Dump the enriched rows to a timestamped .xlsx next to this script.
    wb = Workbook()
    worksheet = wb.active
    # BUGFIX: the original header listed 'task_uid' twice (18 columns) while
    # each data row carries 17 values; the duplicate column is removed.
    file_title_str = 'dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_, task_uid, task_uid_name, task_uid_addr, task_uid_lat, task_uid_lng'
    file_title_l = file_title_str.replace(' ', '').split(',')
    worksheet.append(file_title_l)
    for city in res_dic:
        for district in res_dic[city]:
            for l in res_dic[city][district]:
                worksheet.append(l)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    fname = '9场景添加task_uid'
    # os.path.join is portable; the original '%s\%s' only worked on Windows.
    file_name_save = '%s%s%s' % (os.path.join(curPath, fname), localtime_, '.xlsx')
    wb.save(file_name_save)
    wb.close()
    

      

    import xlrd
    import time
    import sys
    import os
    import requests
    import sqlite3
    import threading
    import math
    import csv
    
    curPath = os.path.abspath(os.path.dirname(__file__))
    rootPath = os.path.split(curPath)[0]
    sys.path.append(rootPath)
    
    # MAX_USED_TIMES: per-key daily quota ceiling before a key counts as spent.
    # overrun_str: Baidu's "daily quota exceeded" reply text.
    # DB_KEY_EXHAUST: sentinel returned when every key is over quota.
    # next_day_tag: %H%M%S timestamp at which daily counters are reset.
    MAX_USED_TIMES, overrun_str, DB_KEY_EXHAUST, next_day_tag = 1900, '天配额超限,限制访问', 'DB_KEY_EXHAUST', '000003'
    
    # SQLite database holding table baidu_map_key_used; lives next to this
    # script.  NOTE(review): '%s\%s' path building is Windows-only.
    db = 'py_bdspider_status.db'
    db = '%s\%s' % (curPath, db)
    
    
    def db_chk_one_exist(key):
        """Return 1 if *key* already exists in baidu_map_key_used, else 0.

        BUGFIX: the original wrote ``conn.close`` without parentheses, so the
        connection was never closed; it also interpolated *key* straight into
        the SQL string.  Fixed with try/finally and a parameterized query.
        """
        conn = sqlite3.connect(db)
        try:
            c = conn.cursor()
            res = c.execute('SELECT key FROM baidu_map_key_used WHERE key=?', (key,)).fetchone()
            return 0 if res is None else 1
        finally:
            conn.close()
    
    
    # def db_init_key_table():
    #     conn = sqlite3.connect(db)
    #     c = conn.cursor()
    #     k_file = '%s\%s' % (curPath, 'bdmap_key.txt')
    #     with open(k_file, 'r', encoding='utf-8') as pf:
    #         for i in pf:
    #             if len(i) < 4:
    #                 continue
    #             author, key = i.replace(' ', '').replace('
    ', '').replace('	', '').split(';')
    #             r = db_chk_one_exist(key)
    #             if r == 0:
    #                 localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    #                 sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
    #                     author, key, localtime_, 0)
    #                 c.execute(sql)
    #                 conn.commit()
    #     conn.close()
    #     pf.close()
    #
    #
    # db_init_key_table()
    
    
    
    def db_recovery_bdkeynum():
        """Reset every key's today_used counter once a day.

        Fires only when the current %H%M%S equals next_day_tag, so callers
        must poll around that second for the reset to happen.
        Uses a parameterized UPDATE (the original embedded the timestamp
        unquoted in the SQL text) and closes the connection via try/finally.
        """
        if time.strftime("%H%M%S", time.localtime()) == next_day_tag:
            conn = sqlite3.connect(db)
            try:
                c = conn.cursor()
                localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
                c.execute('UPDATE baidu_map_key_used SET today_used = 0, update_time=?', (localtime_,))
                conn.commit()
            finally:
                conn.close()
        return
    
    
    def db_get_one_effective():
        """Pick the least-used API key still under quota, or DB_KEY_EXHAUST
        when every key has hit MAX_USED_TIMES for the day."""
        db_recovery_bdkeynum()
        conn = sqlite3.connect(db)
        sql = 'SELECT key FROM baidu_map_key_used WHERE today_used<=%s ORDER BY today_used ASC' % (MAX_USED_TIMES)
        row = conn.cursor().execute(sql).fetchone()
        conn.close()
        return DB_KEY_EXHAUST if row is None else row[0]
    
    
    def db_update_one_today_used(key):
        """Increment *key*'s today_used counter and stamp update_time.

        Parameterized query: the original formatted both values straight into
        the SQL text, which breaks on quotes in *key* and invites injection.
        """
        conn = sqlite3.connect(db)
        try:
            localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
            conn.cursor().execute(
                'UPDATE baidu_map_key_used SET today_used = today_used+1, update_time=? WHERE key=?',
                (localtime_, key))
            conn.commit()
        finally:
            conn.close()
    
    
    # Result / exception directories for the crawler output.
    # BUGFIX: the original "'%s\%s\' % (...)" did not parse — the trailing
    # backslash escaped the closing quote.  Rebuilt with os.path.join plus a
    # trailing separator to preserve the intended "<curPath>\<dir>\" prefix.
    dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
    requested_file_dir_str = os.path.join(curPath, dir_) + os.sep
    requested_file_dir_exception_str = os.path.join(curPath, dir_exception) + os.sep
    requested_file_dir = os.listdir(requested_file_dir_str)
    
    
    def gen_requested_file_list(file_postfix='.html'):
        """Append to the module-level requested_file_list the stem of every
        result file under dir_ (deduplicated).

        The stem is whatever follows the dir_ name in the separator-less
        concatenation, with any '&...' suffix and *file_postfix* stripped —
        i.e. the query string the result file was saved under.
        """
        filepath = '%s\%s' % (curPath, dir_)
        for entry in os.listdir(filepath):
            # No separator on purpose: splitting on dir_ below recovers the
            # bare file name exactly as the original code did.
            child = '%s%s' % (filepath, entry)
            stem = child.split(dir_)[1].split('&')[0].split(file_postfix)[0]
            if stem not in requested_file_list:
                requested_file_list.append(stem)
    
    
    def gen_file_data(fodir, fname, sheet_index=0):
        """Load a tabular file into a list of rows (each row a list of strings).

        fodir: directory containing the file.
        fname: file name; '.xlsx' is read via xlrd, '.csv' via the csv module.
        sheet_index: worksheet index for .xlsx input (default 0).
        Raises ValueError on an unsupported extension (the original fell
        through and crashed with UnboundLocalError at the return).
        """
        # os.path.join is portable; the original '%s\%s' was Windows-only.
        fname_open = os.path.join(fodir, fname)
        if fname.find('.xlsx') > -1:
            book = xlrd.open_workbook(fname_open, on_demand=True)
            sheet = book.sheet_by_index(sheet_index)
            data = [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
            book.release_resources()
            del book
        elif fname.find('.csv') > -1:
            # 'with' closes the handle; the original redundant close() removed.
            with open(fname_open, 'r', encoding='utf-8') as csvfile:
                data = [row for row in csv.reader(csvfile, delimiter=',')]
        else:
            raise ValueError('unsupported file type: %s' % fname)
        return data
    
    
    # "3 9": three target cities and nine excluded POI categories (lists below).
    # request_dic is the accumulator filled later as {city: {district: {...}}}.
    request_dic, target_type_list, target_type_except_list = {}, ['北京市', '上海市', '广州市'], ['火车站', '高铁站', '汽车站', '飞机场', '小学',
                                                                                         '幼儿园', '中学',
                                                                                         '综合医院', '商场']
    # Alternative category lists kept for reference:
    # ['4s店','餐饮','家电','酒店','咖啡馆','售楼处','专科医院']
    # ['住宅小区','写字楼']
    
    # Optionally pre-load already-crawled names so they are skipped below:
    # file_postfix_l = ['.html', '.txt']
    # for i in file_postfix_l:
    #     gen_requested_file_list(i)
    
    # Legacy POI export, loaded relative to this script's directory at import time.
    fname_source = 'jfinder_public_jmtool_old_data.csv'
    data_file = gen_file_data(curPath, fname_source)
    
    
    def replace_illeagl_tag(str_):
        """Strip whitespace-like characters (space, newline, tab) from *str_*.

        BUGFIX: the scraped source had literal line breaks inside the string
        list, which does not parse; restored as proper escape sequences.
        """
        for ch in (' ', '\n', '\t'):
            str_ = str_.replace(ch, '')
        return str_
    
    
    # Examples of already-acceptable normalized names:
    # 碧海富通城三期(3栋) ok
    # =碧海富通城-三期(3栋) ok
    # Characters stripped from names before querying Baidu.
    # NOTE(review): the scraped source garbled the '\t'/'\n' escapes and
    # flattened the full-width '？' to ASCII; restored to the evident intent.
    replace_to_empty_l = [' ', '|', '\t', '\n', '/', '？', '?', '·', '.']
    
    
    def gen_bd_query_origin_name(name_):
        """Normalize a POI name for use as a Baidu Map query string:
        drop junk characters and convert full-width punctuation."""
        for junk in replace_to_empty_l:
            name_ = name_.replace(junk, '')
        return name_.replace('（', '(').replace('）', ')').replace('？', '').replace('?', '')
    
    
    # Fold every data row into request_dic[city][district], skipping rows whose
    # query was already crawled (present in requested_file_list).
    for row in data_file:
        dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_ = row
        if db_from == 'db_from':  # header row repeats the column names
            continue
        request_name = gen_bd_query_origin_name(name_)
        input_ = '%s%s%s' % (city, district, request_name)
        if input_ in requested_file_list:
            print('requested', input_)
            continue
        bucket = request_dic.setdefault(city, {}).setdefault(district, {
            'request_name_list': [], 'request_uid_list': [], 'file_row_list': []})
        if request_name not in bucket['request_name_list']:
            bucket['request_name_list'].append(request_name)
        uid = uid.replace(' ', '')
        if uid and uid not in bucket['request_uid_list']:
            bucket['request_uid_list'].append(uid)
        bucket['file_row_list'].append(row)
    del data_file
    
    # Baidu Place "suggestion" endpoint template; R-QUERY / R-CITY / R-AK are
    # substituted per request before fetching.
    base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'
    # Reply fragments marking a failed or over-quota request; such replies are
    # logged and not written to disk.
    ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
            'Address already in use', '天配额超限,限制访问', 'Parameter Invalid']
    
    # BUGFIX: the original "'%s\%s\' % (...)" did not parse (trailing backslash
    # escaped the quote); rebuilt portably with a trailing separator.
    write_res_file_dir = os.path.join(curPath, dir_) + os.sep
    
    
    def write_res_file(str_, input_, ak, dir_=write_res_file_dir, file_postfix='.txt'):
        """Persist one Baidu reply to <dir_><input_><file_postfix>.

        If the reply contains any known error marker from ex_l it is only
        logged (together with the offending key) and nothing is written.
        """
        for ex in ex_l:
            if str_.find(ex) > -1:
                print('EXCEPTION-', ex, 'AK-', ak, 'STR-', str_)
                return
        fname = '%s%s%s' % (dir_, input_, file_postfix)
        # 'with' already closes the file; the original redundant close() removed.
        with open(fname, 'w', encoding='utf-8') as ft:
            ft.write(str_)
        print('ok', threading.get_ident(), input_)
    
    
    class MyThread(threading.Thread):
        """Thin Thread wrapper that calls *func* with a single *args* value."""
    
        def __init__(self, func, args, name):
            super().__init__()
            self.name = name
            self.func = func
            self.args = args
    
        def run(self):
            # Note: args is passed as ONE positional argument, not unpacked.
            self.func(self.args)
    
    
    def fun_(city):
        """Crawl Baidu suggestions for every pending request name in *city*.

        For each (district, request_name) pair: take the least-used API key,
        build the request URL, fetch it, bump the key's usage counter and
        write the raw reply to disk.  Breaks out of the current district's
        loop once every key is over quota.
        """
        for district in request_dic[city]:
            for request_name in request_dic[city][district]['request_name_list']:
                ak = db_get_one_effective()
                if ak == DB_KEY_EXHAUST:
                    print(DB_KEY_EXHAUST)
                    break
                else:
                    url_ = base_url.replace('R-QUERY', request_name).replace('R-CITY', city).replace('R-AK', ak)
                    print(url_)
                    input_ = '%s%s%s' % (city, district, request_name)
    
                    # NOTE(review): no try/except here — a network failure kills
                    # the whole thread (the commented block below handled it).
                    bd_res_json_str = requests.get(url_).text
                    db_update_one_today_used(ak)
                    write_res_file(bd_res_json_str, input_, ak)
    
                    # try:
                    #     # gen_requested_file_list()
                    #     # gen_requested_file_list('.txt')
                    #     # if input_ in requested_file_list:
                    #     #     continue
                    #     bd_res_json_str = requests.get(url_).text
                    #     db_update_one_today_used(ak)
                    #     write_res_file(bd_res_json_str, input_)
                    # except Exception:
                    #     bd_res_json_str = '请求百度-异常'
                    #     write_res_file(bd_res_json_str, input_, requested_file_dir_exception_str)
                    #     print(bd_res_json_str, input_)
                    #     print(bd_res_json_str, input_)
    
    
    # Optional CLI args select which slice of the sorted city list this process
    # handles (1-based, inclusive); defaults cover everything up to 200 cities.
    try:
        start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
    except Exception:
        start_loop, stop_loop = -1, 200
    
    
    def main():
        """Spawn one crawler thread per selected city and wait for them all."""
        threads_list, nloop = [], 0
        request_dic_city_l = sorted(request_dic, reverse=False)
        for city in request_dic_city_l:
            nloop += 1
            # Honor the CLI-selected 1-based slice [start_loop, stop_loop].
            if nloop < start_loop or nloop > stop_loop:
                continue
            thread_instance = MyThread(fun_, city, fun_.__name__)
            threads_list.append(thread_instance)
        for t in threads_list:
            # BUGFIX: the original 't.setDaemon = False' overwrote the method
            # with a bool instead of configuring the flag; set .daemon instead.
            t.daemon = False
            t.start()
        for t in threads_list:
            t.join()
    
    
    if __name__ == '__main__':
        main()
    

      

  • 相关阅读:
    element-ui Notification重叠问题,原因及解决办法
    详解CSS3实现无限循环的无缝滚动
    js监听浏览器离开页面操作
    判断浏览器
    轮播动效 | 环形进度条 -- 等 动效库
    拓扑插件搜集
    jquery-图片懒加载
    [原]开源的视频转换器,支持gpu,绝对好用ffmpeg的GUI==》dmMediaConverter最新版本2.3
    【原】font-awesome IE6支持代码本人测试成功
    【原创】 c#单文件绿色资源自更新
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7498300.html
Copyright © 2011-2022 走看看