zoukankan      html  css  js  c++  java
  • json.dumps(i['bd_res'], ensure_ascii=False)

    json.dumps(i['bd_res'], ensure_ascii=False)


    import xlrd
    import time
    import sys
    import os
    import requests
    import sqlite3
    import threading
    import math
    import csv
    import pprint
    import json
    from openpyxl import Workbook
    
    # Make the project root importable when this script runs from a subfolder.
    curPath = os.path.abspath(os.path.dirname(__file__))
    rootPath = os.path.split(curPath)[0]
    sys.path.append(rootPath)
    
    
    def gen_file_data(fodir, fname, sheet_index=0, ):
        """Load a spreadsheet into a list of rows (each row a list of strings).

        fodir -- directory containing the file
        fname -- file name; dispatched on extension ('.xlsx' via xlrd, '.csv' via csv)
        sheet_index -- worksheet to read for .xlsx input (default: first)

        Raises ValueError for an unsupported extension instead of crashing
        later with UnboundLocalError on `data`.
        """
        # os.path.join replaces the Windows-only "'%s\%s'" concatenation.
        fname_open = os.path.join(fodir, fname)
        if fname.find('.xlsx') > -1:
            book = xlrd.open_workbook(fname_open, on_demand=True)
            sheet = book.sheet_by_index(sheet_index)
            data = [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
            book.release_resources()
            del book
        elif fname.find('.csv') > -1:
            data = []
            # `with` closes the file; the old explicit close() was redundant.
            with open(fname_open, 'r', encoding='utf-8') as csvfile:
                for row in csv.reader(csvfile, delimiter=','):
                    data.append(row)
        else:
            raise ValueError('unsupported file type: %s' % fname)
        return data
    
    
    # 3 9
    # request_dic: {city: {district: {...}}} accumulator filled by the loop below.
    # NOTE(review): the names look swapped -- *target_type_list* holds cities
    # (Beijing/Shanghai/Guangzhou) while *target_type_except_list* holds the 9
    # venue categories (stations, airport, schools, hospital, mall). Confirm
    # before relying on either name.
    request_dic, target_type_list, target_type_except_list = {}, ['北京市', '上海市', '广州市'], ['火车站', '高铁站', '汽车站', '飞机场', '小学',
                                                                                         '幼儿园', '中学',
                                                                                         '综合医院', '商场']
    # Other category batches kept for reference:
    # ['4s店','餐饮','家电','酒店','咖啡馆','售楼处','专科医院']
    # ['住宅小区','写字楼']
    
    
    # Source rows exported from the old jmtool database (CSV with header row).
    fname_source = 'jfinder_public_jmtool_old_data.csv'
    data_file = gen_file_data(curPath, fname_source)
    
    
    def replace_illeagl_tag(str_):
        """Strip whitespace characters (space, newline, tab) from *str_*.

        The original list literal was mangled by the page scrape (a raw
        newline/tab inside the quotes); restored here as explicit escapes.
        """
        for ch in (' ', '\n', '\t'):
            str_ = str_.replace(ch, '')
        return str_
    
    
    # Characters stripped from place names before querying Baidu, e.g.
    # "碧海富通城-三期(3栋)" -> "碧海富通城三期(3栋)".
    # The literal was scrape-mangled (raw newline/tab inside quotes); restored
    # as '\n'/'\t'.  NOTE(review): the two '?' entries were probably ASCII '?'
    # and full-width '?' originally -- confirm against the pre-scrape source.
    replace_to_empty_l = [' ', '|', '\t', '\n', '/', '?', '?', '·', '.']
    
    
    def gen_bd_query_origin_name(name_):
        """Normalize a raw place name into the query string sent to Baidu.

        Strips every character in the module-level ``replace_to_empty_l``,
        maps full-width parentheses to ASCII ones, then drops both question
        mark variants.
        """
        cleaned = name_
        for junk in replace_to_empty_l:
            cleaned = cleaned.replace(junk, '')
        cleaned = cleaned.replace('(', '(').replace(')', ')')
        return cleaned.replace('?', '').replace('?', '')
    
    
    # Group the exported rows by city/district, deduplicating query names and
    # uids.  Resulting layout:
    #   request_dic[city][district] = {'request_name_list': [...],
    #                                  'request_uid_list': [...],
    #                                  'file_row_list': [raw rows]}
    for l in data_file:
        # db_from, db_id, db_area_code, db_name, db_type_, db_city, db_district, db_address, db_street, db_uid, db_submit_time = l
        # db_from, id, area_code, name, type_, city, district, address, street, uid, submit_time = l
        dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_ = l
    
        # Skip the CSV header row (its db_from cell holds the literal column name).
        if db_from == 'db_from':
            continue
        request_name = gen_bd_query_origin_name(name_)
        if city not in request_dic:
            request_dic[city] = {}
        if district not in request_dic[city]:
            request_dic[city][district] = {}
            request_dic[city][district]['request_name_list'] = []
            request_dic[city][district]['request_uid_list'] = []
            request_dic[city][district]['file_row_list'] = []
        if request_name not in request_dic[city][district]['request_name_list']:
            request_dic[city][district]['request_name_list'].append(request_name)
        uid = uid.replace(' ', '')
        # Only keep non-empty, previously unseen uids.
        if len(uid) > 0 and uid not in request_dic[city][district]['request_uid_list']:
            request_dic[city][district]['request_uid_list'].append(uid)
        request_dic[city][district]['file_row_list'].append(l)
    # Free the raw file data; everything needed now lives in request_dic.
    del data_file
    
    
    def get_search_word_res_first_uid(jstr):
        """Return the uid of the first hit in a Baidu search-word response.

        *jstr* is the raw JSON response text; returns '' when no result.
        """
        payload = json.loads(jstr)
        results = payload.get('result', [])
        if results:
            return results[0]['uid']
        return ''
    
    
    def get_uid_res_first_uid(jstr):
        """Return the uid of the first entry in a Baidu uid-lookup response.

        Mirrors get_search_word_res_first_uid: '' when nothing was found.
        """
        dic_ = json.loads(jstr)
        if 'result' not in dic_ or len(dic_['result']) == 0:
            return ''
        return dic_['result'][0]['uid']
    
    
    def chk_uid_res_5_field(jstr):
        """Check a uid-lookup response carries all five required fields.

        True only when result.location.lng, result.location.lat, result.name
        and result.address are all present; guard-clause style instead of the
        original nested ifs.
        """
        payload = json.loads(jstr)
        if 'result' not in payload:
            return False
        res = payload['result']
        if 'location' not in res:
            return False
        loc = res['location']
        return 'lng' in loc and 'lat' in loc and 'name' in res and 'address' in res
    
    
    # Cached Baidu responses from earlier crawl runs, loaded for reuse below.
    # bd_search_word_dic: search_word -> raw JSON of the suggestion query.
    fname_source = 'updating-百度search_word_ret170909152547.xlsx'
    bd_search_word_dic, data_file = {}, gen_file_data(curPath, fname_source)
    for l in data_file:
        search_word, jstr = l
        if search_word == 'search_word':  # header row
            continue
        bd_search_word_dic[search_word] = jstr
    del data_file
    
    # bd_city_district_uid_dic: '<city><district><uid>' -> raw JSON of the
    # uid detail lookup.
    fname_source = 'updating-百度search_uid_ret170909160924.xlsx'
    bd_city_district_uid_dic, data_file = {}, gen_file_data(curPath, fname_source)
    for l in data_file:
        city_district_uid, jstr = l
        if city_district_uid == 'search_uid':  # header row
            continue
        bd_city_district_uid_dic[city_district_uid] = jstr
    del data_file
    
    # Join every source row with the cached Baidu lookups, producing rows
    # extended with (task_uid, name, address, lat, lng).
    res_dic = {}
    for city in request_dic:
        for district in request_dic[city]:
            for l in request_dic[city][district]['file_row_list']:
                dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_ = l
                request_name = gen_bd_query_origin_name(name_)
                task_uid, task_uid_name, task_uid_addr, task_uid_lat, task_uid_lng = '', '', '', '', ''
                # Prefer the uid already present in the source row; otherwise
                # fall back to the first hit of the cached search-word query.
                if len(uid.replace(' ', '')) > 0:
                    task_uid = uid
                elif request_name in bd_search_word_dic:
                    task_uid = get_search_word_res_first_uid(bd_search_word_dic[request_name])
                if len(task_uid) > 0:
                    city_district_uid = '%s%s%s' % (city, district, task_uid)
                    if city_district_uid in bd_city_district_uid_dic:
                        jstr = bd_city_district_uid_dic[city_district_uid]
                        # Only trust responses carrying all five required fields.
                        if chk_uid_res_5_field(jstr) is True:
                            d = json.loads(jstr)['result']
                            # The original multi-line tuple assignment lost its
                            # line-continuation in the scrape (SyntaxError);
                            # rewritten with explicit parentheses.
                            task_uid_name, task_uid_addr, task_uid_lat, task_uid_lng = (
                                d['name'], d['address'],
                                d['location']['lat'], d['location']['lng'])
    
                ll = dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_, task_uid, task_uid_name, task_uid_addr, task_uid_lat, task_uid_lng
                res_dic.setdefault(city, {}).setdefault(district, []).append(ll)
    
    # Write the merged rows to a timestamped .xlsx next to this script.
    wb = Workbook()
    worksheet = wb.active
    # Header fix: 'task_uid' was listed twice (18 header columns vs the
    # 17-value data rows); the duplicate is removed so header and data align.
    file_title_str = 'dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_, task_uid, task_uid_name, task_uid_addr, task_uid_lat, task_uid_lng'
    file_title_l = file_title_str.replace(' ', '').split(',')
    worksheet.append(file_title_l)
    for city in res_dic:
        for district in res_dic[city]:
            for l in res_dic[city][district]:
                worksheet.append(l)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    fname = '9场景添加task_uid'
    # os.path.join replaces the Windows-only '%s\%s%s' formatting; the stray
    # debug assignment `dd = 9` at the end was dropped.
    file_name_save = '%s%s' % (os.path.join(curPath, '%s%s' % (fname, localtime_)), '.xlsx')
    wb.save(file_name_save)
    wb.close()
    

      

    import xlrd
    import time
    import sys
    import os
    import requests
    import sqlite3
    import threading
    import math
    import csv
    
    # Make the project root importable when this script runs from a subfolder.
    curPath = os.path.abspath(os.path.dirname(__file__))
    rootPath = os.path.split(curPath)[0]
    sys.path.append(rootPath)
    
    # MAX_USED_TIMES: daily per-key request quota guard.
    # overrun_str: Baidu's quota-exceeded error text.
    # DB_KEY_EXHAUST: sentinel returned when no key has quota left.
    # next_day_tag: HHMMSS string ('000003') marking when counters reset.
    MAX_USED_TIMES, overrun_str, DB_KEY_EXHAUST, next_day_tag = 1900, '天配额超限,限制访问', 'DB_KEY_EXHAUST', '000003'
    
    # SQLite file tracking per-key usage (table: baidu_map_key_used).
    # NOTE(review): Windows-only '\' separator.
    db = 'py_bdspider_status.db'
    db = '%s\%s' % (curPath, db)
    
    
    def db_chk_one_exist(key, db_path=None):
        """Return 1 if *key* exists in baidu_map_key_used, else 0.

        key -- Baidu AK to look up
        db_path -- optional SQLite path; defaults to the module-level ``db``

        Fixes two defects: the SQL embedded *key* via string formatting
        (injection-prone) and ``conn.close`` was referenced without being
        called, leaking the connection.
        """
        conn = sqlite3.connect(db_path if db_path is not None else db)
        try:
            c = conn.cursor()
            res = c.execute('SELECT key FROM baidu_map_key_used WHERE key=?',
                            (key,)).fetchone()
            return 0 if res is None else 1
        finally:
            conn.close()
    
    
    # def db_init_key_table():
    #     conn = sqlite3.connect(db)
    #     c = conn.cursor()
    #     k_file = '%s\%s' % (curPath, 'bdmap_key.txt')
    #     with open(k_file, 'r', encoding='utf-8') as pf:
    #         for i in pf:
    #             if len(i) < 4:
    #                 continue
    #             author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
    #             r = db_chk_one_exist(key)
    #             if r == 0:
    #                 localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    #                 sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
    #                     author, key, localtime_, 0)
    #                 c.execute(sql)
    #                 conn.commit()
    #     conn.close()
    #     pf.close()
    #
    #
    # db_init_key_table()
    
    
    
    def db_recovery_bdkeynum():
        """Reset every key's daily usage counter once per day.

        Runs the reset only when the current HHMMSS equals ``next_day_tag``
        ('000003'), i.e. just after midnight.  NOTE(review): this only fires
        if a caller happens to poll during that exact second -- confirm that
        is the intended scheduling.  Parameter binding replaces the old
        string-formatted UPDATE.
        """
        if time.strftime("%H%M%S", time.localtime()) == next_day_tag:
            conn = sqlite3.connect(db)
            try:
                c = conn.cursor()
                localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
                c.execute('UPDATE baidu_map_key_used SET today_used = 0, update_time=?',
                          (localtime_,))
                conn.commit()
            finally:
                conn.close()
        return
    
    
    def db_get_one_effective():
        """Return the AK with the least usage still under quota.

        Triggers the daily counter reset first via db_recovery_bdkeynum().
        Returns DB_KEY_EXHAUST when every key has exceeded MAX_USED_TIMES.
        Parameter binding replaces the old string-formatted SELECT.
        """
        db_recovery_bdkeynum()
        conn = sqlite3.connect(db)
        try:
            c = conn.cursor()
            res = c.execute(
                'SELECT key FROM baidu_map_key_used WHERE today_used<=? ORDER BY today_used ASC',
                (MAX_USED_TIMES,)).fetchone()
            return DB_KEY_EXHAUST if res is None else res[0]
        finally:
            conn.close()
    
    
    def db_update_one_today_used(key, db_path=None):
        """Increment today's usage counter for *key* and stamp update_time.

        key -- Baidu AK whose counter to bump
        db_path -- optional SQLite path; defaults to the module-level ``db``

        Parameter binding replaces the old string-formatted UPDATE (the key
        was interpolated straight into the SQL text).
        """
        conn = sqlite3.connect(db_path if db_path is not None else db)
        try:
            c = conn.cursor()
            localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
            c.execute('UPDATE baidu_map_key_used SET today_used = today_used+1, update_time=? WHERE key=?',
                      (localtime_, key))
            conn.commit()
        finally:
            conn.close()
    
    
    # Folders holding raw per-name response files; the exception folder keeps
    # failed requests.  The original "'%s\%s\'" literals lost a backslash in
    # the scrape (the \' escaped the closing quote -> SyntaxError); restored
    # with explicit double backslashes.  NOTE(review): Windows-only separators.
    dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
    requested_file_dir_str = '%s\\%s\\' % (curPath, dir_)
    requested_file_dir_exception_str = '%s\\%s\\' % (curPath, dir_exception)
    requested_file_dir = os.listdir(requested_file_dir_str)
    
    
    def gen_requested_file_list(file_postfix='.html'):
        """Collect already-crawled request keys into global requested_file_list.

        Each file under <curPath>/<dir_> is named '<city><district><name>' plus
        an optional '&...' suffix and *file_postfix*; strip both to recover the
        key.  The original rebuilt each path with a separator-less concat and
        re-split it on dir_ -- that round-trip reduces to using the directory
        entry itself, which this version does directly (same resulting keys).
        """
        filepath = os.path.join(curPath, dir_)
        for entry in os.listdir(filepath):
            requested_file = entry.split('&')[0].split(file_postfix)[0]
            if requested_file not in requested_file_list:
                requested_file_list.append(requested_file)
    
    
    def gen_file_data(fodir, fname, sheet_index=0, ):
        """Load a spreadsheet into a list of rows (each row a list of strings).

        fodir -- directory containing the file
        fname -- file name; dispatched on extension ('.xlsx' via xlrd, '.csv' via csv)
        sheet_index -- worksheet to read for .xlsx input (default: first)

        Raises ValueError for an unsupported extension instead of crashing
        later with UnboundLocalError on `data`.
        """
        # os.path.join replaces the Windows-only "'%s\%s'" concatenation.
        fname_open = os.path.join(fodir, fname)
        if fname.find('.xlsx') > -1:
            book = xlrd.open_workbook(fname_open, on_demand=True)
            sheet = book.sheet_by_index(sheet_index)
            data = [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
            book.release_resources()
            del book
        elif fname.find('.csv') > -1:
            data = []
            # `with` closes the file; the old explicit close() was redundant.
            with open(fname_open, 'r', encoding='utf-8') as csvfile:
                for row in csv.reader(csvfile, delimiter=','):
                    data.append(row)
        else:
            raise ValueError('unsupported file type: %s' % fname)
        return data
    
    
    # 3 9
    # request_dic: {city: {district: {...}}} accumulator filled by the loop below.
    # NOTE(review): the names look swapped -- *target_type_list* holds cities
    # while *target_type_except_list* holds the 9 venue categories; confirm
    # before relying on either name.
    request_dic, target_type_list, target_type_except_list = {}, ['北京市', '上海市', '广州市'], ['火车站', '高铁站', '汽车站', '飞机场', '小学',
                                                                                         '幼儿园', '中学',
                                                                                         '综合医院', '商场']
    # Other category batches kept for reference:
    # ['4s店','餐饮','家电','酒店','咖啡馆','售楼处','专科医院']
    # ['住宅小区','写字楼']
    
    # Disabled pre-scan of already-crawled files:
    # file_postfix_l = ['.html', '.txt']
    # for i in file_postfix_l:
    #     gen_requested_file_list(i)
    
    # Source rows exported from the old jmtool database (CSV with header row).
    fname_source = 'jfinder_public_jmtool_old_data.csv'
    data_file = gen_file_data(curPath, fname_source)
    
    
    def replace_illeagl_tag(str_):
        """Strip whitespace characters (space, newline, tab) from *str_*.

        The original list literal was mangled by the page scrape (a raw
        newline/tab inside the quotes); restored here as explicit escapes.
        """
        for ch in (' ', '\n', '\t'):
            str_ = str_.replace(ch, '')
        return str_
    
    
    # Characters stripped from place names before querying Baidu, e.g.
    # "碧海富通城-三期(3栋)" -> "碧海富通城三期(3栋)".
    # The literal was scrape-mangled (raw newline/tab inside quotes); restored
    # as '\n'/'\t'.  NOTE(review): the two '?' entries were probably ASCII '?'
    # and full-width '?' originally -- confirm against the pre-scrape source.
    replace_to_empty_l = [' ', '|', '\t', '\n', '/', '?', '?', '·', '.']
    
    
    def gen_bd_query_origin_name(name_):
        """Normalize a raw place name into the query string sent to Baidu.

        Removes every character listed in the module-level
        ``replace_to_empty_l``, converts full-width parentheses to ASCII,
        and drops both question mark variants.
        """
        result = name_
        for junk in replace_to_empty_l:
            result = result.replace(junk, '')
        result = result.replace('(', '(').replace(')', ')')
        return result.replace('?', '').replace('?', '')
    
    
    # Group the exported rows by city/district, skipping names that already
    # have a crawled response file, and deduplicating names and uids.
    for l in data_file:
        # db_from, db_id, db_area_code, db_name, db_type_, db_city, db_district, db_address, db_street, db_uid, db_submit_time = l
        # db_from, id, area_code, name, type_, city, district, address, street, uid, submit_time = l
        dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_ = l
    
        # Skip the CSV header row (its db_from cell holds the literal column name).
        if db_from == 'db_from':
            continue
        request_name = gen_bd_query_origin_name(name_)
        # Request key matches the crawler's output file naming; skip rows
        # whose response file already exists.
        input_ = '%s%s%s' % (city, district, request_name)
        if input_ in requested_file_list:
            print('requested', input_)
            continue
        if city not in request_dic:
            request_dic[city] = {}
        if district not in request_dic[city]:
            request_dic[city][district] = {}
            request_dic[city][district]['request_name_list'] = []
            request_dic[city][district]['request_uid_list'] = []
            request_dic[city][district]['file_row_list'] = []
        if request_name not in request_dic[city][district]['request_name_list']:
            request_dic[city][district]['request_name_list'].append(request_name)
        uid = uid.replace(' ', '')
        # Only keep non-empty, previously unseen uids.
        if len(uid) > 0 and uid not in request_dic[city][district]['request_uid_list']:
            request_dic[city][district]['request_uid_list'].append(uid)
        request_dic[city][district]['file_row_list'].append(l)
    # Free the raw file data; everything needed now lives in request_dic.
    del data_file
    
    # Suggestion-API URL template; R-QUERY / R-CITY / R-AK are substituted per
    # request in fun_ below.
    base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'
    # Error fragments marking a response as unusable (proxy faults, bad AK,
    # quota exceeded, ...); responses containing any of these are not saved.
    ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
            'Address already in use', '天配额超限,限制访问', 'Parameter Invalid']
    
    # Output folder for raw responses.  The original "'%s\%s\'" literal lost a
    # backslash in the scrape (the \' escaped the closing quote ->
    # SyntaxError); restored with explicit double backslashes.
    write_res_file_dir = '%s\\%s\\' % (curPath, dir_)
    
    
    def write_res_file(str_, input_, ak, dir_=write_res_file_dir, file_postfix='.txt'):
        """Persist one raw Baidu response to <dir_><input_><file_postfix>.

        str_ -- response body text
        input_ -- request key ('<city><district><name>'), used as file name
        ak -- the AK used; only echoed in the diagnostic print
        dir_ -- target folder, must end with a path separator (default global)
        file_postfix -- file extension for the saved response

        Responses containing any known error fragment (ex_l) are logged and
        discarded.  The redundant ft.close() after the with-block was dropped.
        """
        for ex in ex_l:
            if str_.find(ex) > -1:
                print('EXCEPTION-', ex, 'AK-', ak, 'STR-', str_)
                return
        fname = '%s%s%s' % (dir_, input_, file_postfix)
        with open(fname, 'w', encoding='utf-8') as ft:
            ft.write(str_)
        print('ok', threading.get_ident(), input_)
    
    
    class MyThread(threading.Thread):
        """Thread wrapper that invokes ``func(args)`` when run.

        Note: run() passes *args* as one positional argument (not unpacked),
        matching how callers hand a bare city string through.
        """

        def __init__(self, func, args, name):
            threading.Thread.__init__(self)
            self.name = name
            self.func = func
            self.args = args

        def run(self):
            self.func(self.args)
    
    
    def fun_(city):
        """Crawl suggestion results for every pending name in *city*.

        For each district's request names: take the least-used AK; if every
        key is exhausted, stop that district's name loop; otherwise fire the
        request, bump the key's usage counter and write the raw response to
        disk via write_res_file.
        """
        for district in request_dic[city]:
            for request_name in request_dic[city][district]['request_name_list']:
                ak = db_get_one_effective()
                if ak == DB_KEY_EXHAUST:
                    print(DB_KEY_EXHAUST)
                    break
                url_ = (base_url
                        .replace('R-QUERY', request_name)
                        .replace('R-CITY', city)
                        .replace('R-AK', ak))
                print(url_)
                input_ = '%s%s%s' % (city, district, request_name)
                bd_res_json_str = requests.get(url_).text
                db_update_one_today_used(ak)
                write_res_file(bd_res_json_str, input_, ak)
    
    
    # Optional CLI bounds selecting which (1-based) city positions this run
    # handles; any parse failure falls back to (-1, 200), i.e. all cities.
    try:
        start_loop = int(sys.argv[1])
        stop_loop = int(sys.argv[2])
    except Exception:
        start_loop, stop_loop = -1, 200
    
    
    def main():
        """Spawn one crawler thread per selected city and wait for them all.

        Cities are processed in sorted order; only positions within
        [start_loop, stop_loop] (1-based) get a thread.
        """
        threads_list, nloop = [], 0
        for city in sorted(request_dic, reverse=False):
            nloop += 1
            if nloop < start_loop or nloop > stop_loop:
                continue
            # The original wrote MyThread(fun_, (city), ...): (city) is just
            # city, so fun_ receives the bare string via MyThread.run's
            # single-argument call -- kept, without the misleading parens.
            threads_list.append(MyThread(fun_, city, fun_.__name__))
        for t in threads_list:
            # Fix: ``t.setDaemon = False`` only shadowed the setDaemon method
            # and changed nothing.  Threads are non-daemon by default; made
            # explicit via the daemon attribute.
            t.daemon = False
            t.start()
        for t in threads_list:
            t.join()
    
    
    if __name__ == '__main__':
        main()
    

      

  • 相关阅读:
    网络测量中基于Sketch方法的简单介绍
    Reading SBAR SDN flow-Based monitoring and Application Recognition
    Reading Meticulous Measurement of Control Packets in SDN
    Reading SketchVisor Robust Network Measurement for Sofeware Packet Processing
    ovs加dpdk在日志中查看更多运行细节的方法
    后缀数组
    (转载)LCA问题的Tarjan算法
    Codeforces Intel Code Challenge Final Round (Div. 1 + Div. 2, Combined) A. Checking the Calendar(水题)
    Vijos 1816统计数字(计数排序)
    卡特兰数
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7498300.html
Copyright © 2011-2022 走看看