"""Request Baidu Map search results for building names read from two Excel sheets.

At import time the module:
  1. deletes cached ``.txt`` responses that contain a known error marker,
  2. lists the responses that are already cached,
  3. reads two Excel sheets and queues every (city, district, name) that is
     not cached yet into ``request_dic``.

``main()`` then starts one thread per city; each thread issues one HTTP GET
per queued name, rotating API keys tracked in a SQLite table.
"""
import xlrd
import time
import sys
import os
import requests
import sqlite3
import threading
import urllib.parse

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

MAX_USED_TIMES, overrun_str, DB_KEY_EXHAUST, next_day_tag = 1900, '天配额超限,限制访问', 'DB_KEY_EXHAUST', '000003'
# Seconds before an HTTP request is abandoned, so one stalled connection
# cannot block its worker thread (and therefore main()'s join) forever.
REQUEST_TIMEOUT_SECONDS = 30

db = 'py_bdspider_status.db'
db = os.path.join(curPath, db)

# Table ``baidu_map_key_used`` has the columns (author, key, update_time,
# today_used).  It was originally seeded by a now-removed helper that read
# ``bdmap_key.txt`` (one ``author;key`` pair per line) and inserted every key
# not yet present with ``today_used = 0``.


def _db_timestamp(now=None):
    """Return the local time as the integer form of ``yymmddHHMMSS``.

    The original statements interpolated the timestamp unquoted, i.e. as a
    numeric SQL literal; binding an ``int`` keeps the stored value the same.
    """
    return int(time.strftime("%y%m%d%H%M%S", now or time.localtime()))


def db_chk_one_exist(key):
    """Return 1 if ``key`` is present in ``baidu_map_key_used``, else 0."""
    conn = sqlite3.connect(db)
    try:
        row = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE key=?', (key,)).fetchone()
    finally:
        # The original wrote ``conn.close`` without calling it.
        conn.close()
    return 0 if row is None else 1


def db_recovery_bdkeynum():
    """Reset every key's ``today_used`` counter when the clock reads ``next_day_tag``.

    NOTE(review): the reset only fires if this function happens to run during
    the single second 00:00:03; a run that misses it never resets the
    counters.  Left as is because the intended reset policy is not visible.
    """
    now = time.localtime()
    if time.strftime("%H%M%S", now) != next_day_tag:
        return
    conn = sqlite3.connect(db)
    try:
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = 0, update_time = ?',
            (_db_timestamp(now),))
        conn.commit()
    finally:
        conn.close()


def db_get_one_effective():
    """Return the least-used key with ``today_used <= MAX_USED_TIMES``.

    Returns ``DB_KEY_EXHAUST`` when no key qualifies.
    """
    db_recovery_bdkeynum()
    conn = sqlite3.connect(db)
    try:
        row = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE today_used<=? '
            'ORDER BY today_used ASC', (MAX_USED_TIMES,)).fetchone()
    finally:
        conn.close()
    return DB_KEY_EXHAUST if row is None else row[0]


def db_update_one_today_used(key):
    """Increment ``today_used`` for ``key`` and stamp ``update_time``."""
    conn = sqlite3.connect(db)
    try:
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = today_used+1, '
            'update_time = ? WHERE key = ?', (_db_timestamp(), key))
        conn.commit()
    finally:
        conn.close()


dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
# Both directory strings keep a trailing separator because write_res_file()
# builds file names by plain concatenation.
requested_file_dir_str = os.path.join(curPath, dir_, '')
requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
requested_file_dir = os.listdir(requested_file_dir_str)

# Substrings that mark a response body as an error page rather than a result.
# NOTE(review): the commas inside the Chinese markers are ASCII in the source
# as received; confirm they match the punctuation the service really returns.
ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
        'Address already in use', '天配额超限,限制访问', 'Parameter Invalid']


def rm_invalid_file(file_postfix='.txt'):
    """Delete cached responses in ``dir_`` whose content contains an ``ex_l`` marker.

    Only files whose extension equals ``file_postfix`` are inspected.
    """
    filepath = os.path.join(curPath, dir_)
    for fname in os.listdir(filepath):
        fdir_o = os.path.join(filepath, fname)
        if os.path.splitext(fdir_o)[1] != file_postfix:
            continue
        with open(fdir_o, 'r', encoding='utf-8') as ft:
            jstr = ft.read()
        # Stop at the first marker: the original kept scanning after deleting
        # the file, so a second match crashed on os.stat().
        ex = next((marker for marker in ex_l if marker in jstr), None)
        if ex is None:
            continue
        try:
            strftime_st_ctime = time.strftime(
                "%y%m%d%H%M%S", time.localtime(os.stat(fdir_o).st_ctime))
            os.remove(fdir_o)
            print('remove', ex, strftime_st_ctime, fdir_o)
        except OSError:
            # Another process or thread may have removed the file already.
            print('multiprocess--multithreading--', fdir_o)


rm_invalid_file()


def gen_requested_file_list(file_postfix='.html'):
    """Append the request name of every cached file in ``dir_`` to ``requested_file_list``.

    The request name is the file name cut at the first ``&`` and then at the
    first occurrence of ``file_postfix``.
    """
    seen = set(requested_file_list)
    for fname in os.listdir(os.path.join(curPath, dir_)):
        requested_file = fname.split('&')[0].split(file_postfix)[0]
        if requested_file not in seen:
            seen.add(requested_file)
            requested_file_list.append(requested_file)


file_postfix_l = ['.txt']
for i in file_postfix_l:
    gen_requested_file_list(i)

# Set view of the cached names: the sheets are checked row by row, so list
# membership would make the queueing below quadratic.
_requested_files = set(requested_file_list)


def gen_file_data(fname_source, file_type='.xlsx'):
    """Return every row of the workbook's first sheet as a list of ``str`` cells.

    The workbook is ``<curPath>/<fname_source><file_type>``.
    NOTE(review): recent xlrd releases reportedly read only ``.xls`` files;
    verify the installed version still opens ``.xlsx``.
    """
    excel_ = os.path.join(curPath, '%s%s' % (fname_source, file_type))
    book = xlrd.open_workbook(excel_, on_demand=True)
    try:
        sheet = book.sheet_by_index(0)
        return [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
    finally:
        book.release_resources()


request_dic, target_type_list, target_type_except_list = {}, [], []


def replace_illeagl_tag(str_):
    """Return ``str_`` with all whitespace removed.

    NOTE(review): the source as received listed three identical ``' '``
    entries, presumably distinct whitespace escapes before the text was
    mangled; removing all whitespace covers every plausible original.
    """
    return ''.join(str_.split())


# Example names (both accepted by the service):
#   碧海富通城三期(3栋)
#   碧海富通城-三期(3栋)
# Characters dropped from a building name before it is used as query text and
# as part of the cache file name.
# NOTE(review): the received list held three whitespace entries (handled in
# gen_bd_query_origin_name) and a duplicated '?', and the function also
# replaced '(' with '(' and ')' with ')'.  Those duplicates and no-ops were
# presumably full-width characters before the source was normalised -- confirm
# against the original file and restore them if so.
replace_to_empty_l = [' ', '|', '/', '?', '·', '.']


def gen_bd_query_origin_name(name_):
    """Return ``name_`` with whitespace and ``replace_to_empty_l`` characters removed."""
    name_ = ''.join(name_.split())
    for token in replace_to_empty_l:
        name_ = name_.replace(token, '')
    return name_


def _queue_request(city, district, name_):
    """Queue ``name_`` under ``request_dic[city][district]`` unless cached or queued."""
    request_name = gen_bd_query_origin_name(name_)
    if '%s%s%s' % (city, district, request_name) in _requested_files:
        return
    names = request_dic.setdefault(city, {}).setdefault(district, [])
    if request_name not in names:
        names.append(request_name)


fname_source = '【SOURCE】采集员新增任务133598条-楼宇归集-互异百度uid数51700'
data_selfadd = gen_file_data(fname_source)
# Column layout: dbid, area_code, type_, city, district, uid, name_, address,
# street, request_name, submit_time.  Every sheet row is queued, including a
# header row if the sheet has one.
for row in data_selfadd:
    _queue_request(row[3], row[4], row[6])
del data_selfadd

fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339'
data_jmtool = gen_file_data(fname_source)
# Column layout: dbid, area_code, name_, request_name, type_, city, district,
# addr, street, bd_status, ... (24 columns in total).
for row in data_jmtool:
    _queue_request(row[5], row[6], row[2])
del data_jmtool

write_res_file_dir = os.path.join(curPath, dir_, '')


def write_res_file(input_, str_, dir_=write_res_file_dir, file_postfix='.txt', ak=None, url_=None):
    """Write ``str_`` to ``<dir_><input_><file_postfix>`` unless it is an error response.

    When ``str_`` contains one of the ``ex_l`` markers nothing is written and
    the marker is printed with ``ak`` and ``url_``.  These two are optional
    diagnostics supplied by the caller; the original read them through
    ``global`` names that were never defined, so every error response raised
    ``NameError``.
    """
    for ex in ex_l:
        if ex in str_:
            print('EXCEPTION-', ex, 'AK-', ak, 'URL-', url_)
            return
    fname = '%s%s%s' % (dir_, input_, file_postfix)
    with open(fname, 'w', encoding='utf-8') as ft:
        ft.write(str_)
    print('ok', threading.get_ident(), input_)


class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` with a single positional argument."""

    def __init__(self, func, args, name):
        threading.Thread.__init__(self, name=name)
        self.func, self.args = func, args

    def run(self):
        self.func(self.args)


# Example request (API key redacted):
#   .../search?query=瀛嘉天下&region=重庆市&city_limit=true&output=json&ak=<AK>
# NOTE(review): the scheme, host, path and ``query=R-QUERY`` part were missing
# from the source as received ('&reg' had been decoded to '®').  They are
# restored here on the assumption that the target is the Baidu Place search
# endpoint -- confirm before running.
base_url = ('http://api.map.baidu.com/place/v2/search'
            '?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK')


def fun_(city):
    """Fetch and cache the search result of every queued name of ``city``.

    Stops as soon as no API key with remaining quota is available.
    """
    for district in request_dic[city]:
        for request_name in request_dic[city][district]:
            request_name_chk = '%s%s%s' % (city, district, request_name)
            if request_name_chk in _requested_files:
                continue
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # The original ``break`` only left the inner loop and then
                # re-queried the exhausted key table once per district.
                return
            # Percent-encode the values so names containing '&', '#' or '%'
            # cannot alter the query string.
            url_ = (base_url
                    .replace('R-QUERY', urllib.parse.quote(request_name, safe=''))
                    .replace('R-CITY', urllib.parse.quote(city, safe=''))
                    .replace('R-AK', ak))
            try:
                bd_res_json_str = requests.get(url_, timeout=REQUEST_TIMEOUT_SECONDS).text
                db_update_one_today_used(ak)
                write_res_file(request_name_chk, bd_res_json_str, ak=ak, url_=url_)
            except Exception as exc:
                # Best effort by design: record the failure and carry on.
                bd_res_json_str = '请求百度-异常'
                write_res_file(request_name_chk, bd_res_json_str, requested_file_dir_exception_str)
                print(request_name_chk, bd_res_json_str, repr(exc))


# Optional command-line arguments: first and last (1-based, inclusive)
# position in the sorted city list to process.
try:
    start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
except (IndexError, ValueError):
    start_loop, stop_loop = -1, 200


def main():
    """Run ``fun_`` in one thread per selected city and wait for all of them."""
    threads_list = []
    for nloop, city in enumerate(sorted(request_dic), start=1):
        if nloop < start_loop or nloop > stop_loop:
            continue
        threads_list.append(MyThread(fun_, city, fun_.__name__))
    for t in threads_list:
        # The original assigned to ``t.setDaemon``, which replaced the method
        # instead of setting the flag.
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


if __name__ == '__main__':
    main()
"""Delete cached response files that contain a known error marker.

Running (or importing) this module scans ``<curPath>/baidu_map_uid`` once.
"""
import time
import sys
import os

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
# Both directory strings end with a path separator, as the original
# '%s\\%s\\' literals did.
requested_file_dir_str = os.path.join(curPath, dir_, '')
requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
requested_file_dir = os.listdir(requested_file_dir_str)

# Substrings that mark a response body as an error page rather than a result.
# NOTE(review): the commas inside the Chinese markers are ASCII in the source
# as received; confirm they match the punctuation the service really returns.
ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
        'Address already in use', '天配额超限,限制访问', 'Parameter Invalid']


def rm_invalid_file(file_postfix='.txt'):
    """Delete files in ``dir_`` whose content contains an ``ex_l`` marker.

    Only files whose extension equals ``file_postfix`` are inspected.
    """
    filepath = os.path.join(curPath, dir_)
    for fname in os.listdir(filepath):
        fdir_o = os.path.join(filepath, fname)
        if os.path.splitext(fdir_o)[1] != file_postfix:
            continue
        with open(fdir_o, 'r', encoding='utf-8') as ft:
            jstr = ft.read()
        # Stop at the first marker: the original kept scanning after deleting
        # the file, so a second match crashed on os.stat().
        ex = next((marker for marker in ex_l if marker in jstr), None)
        if ex is None:
            continue
        try:
            strftime_st_ctime = time.strftime(
                "%y%m%d%H%M%S", time.localtime(os.stat(fdir_o).st_ctime))
            os.remove(fdir_o)
            print('remove', ex, strftime_st_ctime, fdir_o)
        except OSError:
            # Another process may have removed the file in the meantime.
            print('multiprocess--multithreading--', fdir_o)


rm_invalid_file()
"""Request Baidu Map search results for building names read from two Excel sheets.

At import time the module:
  1. deletes cached ``.txt`` responses that contain a known error marker,
  2. lists the responses that are already cached,
  3. reads two Excel sheets and queues every (city, district, name) that is
     not cached yet into ``request_dic``.

``main()`` then starts one thread per city; each thread issues one HTTP GET
per queued name, rotating API keys tracked in a SQLite table.
"""
import xlrd
import time
import sys
import os
import requests
import sqlite3
import threading
import urllib.parse

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

MAX_USED_TIMES, overrun_str, DB_KEY_EXHAUST, next_day_tag = 1900, '天配额超限,限制访问', 'DB_KEY_EXHAUST', '000003'
# Seconds before an HTTP request is abandoned, so one stalled connection
# cannot block its worker thread (and therefore main()'s join) forever.
REQUEST_TIMEOUT_SECONDS = 30

db = 'py_bdspider_status.db'
db = os.path.join(curPath, db)

# Table ``baidu_map_key_used`` has the columns (author, key, update_time,
# today_used).  It was originally seeded by a now-removed helper that read
# ``bdmap_key.txt`` (one ``author;key`` pair per line) and inserted every key
# not yet present with ``today_used = 0``.


def _db_timestamp(now=None):
    """Return the local time as the integer form of ``yymmddHHMMSS``.

    The original statements interpolated the timestamp unquoted, i.e. as a
    numeric SQL literal; binding an ``int`` keeps the stored value the same.
    """
    return int(time.strftime("%y%m%d%H%M%S", now or time.localtime()))


def db_chk_one_exist(key):
    """Return 1 if ``key`` is present in ``baidu_map_key_used``, else 0."""
    conn = sqlite3.connect(db)
    try:
        row = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE key=?', (key,)).fetchone()
    finally:
        # The original wrote ``conn.close`` without calling it.
        conn.close()
    return 0 if row is None else 1


def db_recovery_bdkeynum():
    """Reset every key's ``today_used`` counter when the clock reads ``next_day_tag``.

    NOTE(review): the reset only fires if this function happens to run during
    the single second 00:00:03; a run that misses it never resets the
    counters.  Left as is because the intended reset policy is not visible.
    """
    now = time.localtime()
    if time.strftime("%H%M%S", now) != next_day_tag:
        return
    conn = sqlite3.connect(db)
    try:
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = 0, update_time = ?',
            (_db_timestamp(now),))
        conn.commit()
    finally:
        conn.close()


def db_get_one_effective():
    """Return the least-used key with ``today_used <= MAX_USED_TIMES``.

    Returns ``DB_KEY_EXHAUST`` when no key qualifies.
    """
    db_recovery_bdkeynum()
    conn = sqlite3.connect(db)
    try:
        row = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE today_used<=? '
            'ORDER BY today_used ASC', (MAX_USED_TIMES,)).fetchone()
    finally:
        conn.close()
    return DB_KEY_EXHAUST if row is None else row[0]


def db_update_one_today_used(key):
    """Increment ``today_used`` for ``key`` and stamp ``update_time``."""
    conn = sqlite3.connect(db)
    try:
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = today_used+1, '
            'update_time = ? WHERE key = ?', (_db_timestamp(), key))
        conn.commit()
    finally:
        conn.close()


dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
# Both directory strings keep a trailing separator because write_res_file()
# builds file names by plain concatenation.
requested_file_dir_str = os.path.join(curPath, dir_, '')
requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
requested_file_dir = os.listdir(requested_file_dir_str)

# Substrings that mark a response body as an error page rather than a result.
# NOTE(review): the commas inside the Chinese markers are ASCII in the source
# as received; confirm they match the punctuation the service really returns.
ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
        'Address already in use', '天配额超限,限制访问', 'Parameter Invalid']


def rm_invalid_file(file_postfix='.txt'):
    """Delete cached responses in ``dir_`` whose content contains an ``ex_l`` marker.

    Only files whose extension equals ``file_postfix`` are inspected.
    """
    filepath = os.path.join(curPath, dir_)
    for fname in os.listdir(filepath):
        fdir_o = os.path.join(filepath, fname)
        if os.path.splitext(fdir_o)[1] != file_postfix:
            continue
        with open(fdir_o, 'r', encoding='utf-8') as ft:
            jstr = ft.read()
        # Stop at the first marker: the original kept scanning after deleting
        # the file, so a second match crashed on os.stat().
        ex = next((marker for marker in ex_l if marker in jstr), None)
        if ex is None:
            continue
        try:
            strftime_st_ctime = time.strftime(
                "%y%m%d%H%M%S", time.localtime(os.stat(fdir_o).st_ctime))
            os.remove(fdir_o)
            print('remove', ex, strftime_st_ctime, fdir_o)
        except OSError:
            # Another process or thread may have removed the file already.
            print('multiprocess--multithreading--', fdir_o)


rm_invalid_file()


def gen_requested_file_list(file_postfix='.html'):
    """Append the request name of every cached file in ``dir_`` to ``requested_file_list``.

    The request name is the file name cut at the first ``&`` and then at the
    first occurrence of ``file_postfix``.
    """
    seen = set(requested_file_list)
    for fname in os.listdir(os.path.join(curPath, dir_)):
        requested_file = fname.split('&')[0].split(file_postfix)[0]
        if requested_file not in seen:
            seen.add(requested_file)
            requested_file_list.append(requested_file)


file_postfix_l = ['.txt']
for i in file_postfix_l:
    gen_requested_file_list(i)

# Set view of the cached names: the sheets are checked row by row, so list
# membership would make the queueing below quadratic.
_requested_files = set(requested_file_list)


def gen_file_data(fname_source, file_type='.xlsx'):
    """Return every row of the workbook's first sheet as a list of ``str`` cells.

    The workbook is ``<curPath>/<fname_source><file_type>``.
    NOTE(review): recent xlrd releases reportedly read only ``.xls`` files;
    verify the installed version still opens ``.xlsx``.
    """
    excel_ = os.path.join(curPath, '%s%s' % (fname_source, file_type))
    book = xlrd.open_workbook(excel_, on_demand=True)
    try:
        sheet = book.sheet_by_index(0)
        return [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
    finally:
        book.release_resources()


request_dic, target_type_list, target_type_except_list = {}, [], []


def replace_illeagl_tag(str_):
    """Return ``str_`` with all whitespace removed.

    NOTE(review): the source as received listed three identical ``' '``
    entries, presumably distinct whitespace escapes before the text was
    mangled; removing all whitespace covers every plausible original.
    """
    return ''.join(str_.split())


# Example names (both accepted by the service):
#   碧海富通城三期(3栋)
#   碧海富通城-三期(3栋)
# Characters dropped from a building name before it is used as query text and
# as part of the cache file name.
# NOTE(review): the received list held three whitespace entries (handled in
# gen_bd_query_origin_name) and a duplicated '?', and the function also
# replaced '(' with '(' and ')' with ')'.  Those duplicates and no-ops were
# presumably full-width characters before the source was normalised -- confirm
# against the original file and restore them if so.
replace_to_empty_l = [' ', '|', '/', '?', '·', '.']


def gen_bd_query_origin_name(name_):
    """Return ``name_`` with whitespace and ``replace_to_empty_l`` characters removed."""
    name_ = ''.join(name_.split())
    for token in replace_to_empty_l:
        name_ = name_.replace(token, '')
    return name_


def _queue_request(city, district, name_):
    """Queue ``name_`` under ``request_dic[city][district]`` unless cached or queued."""
    request_name = gen_bd_query_origin_name(name_)
    if '%s%s%s' % (city, district, request_name) in _requested_files:
        return
    names = request_dic.setdefault(city, {}).setdefault(district, [])
    if request_name not in names:
        names.append(request_name)


fname_source = '【SOURCE】采集员新增任务133598条-楼宇归集-互异百度uid数51700'
data_selfadd = gen_file_data(fname_source)
# Column layout: dbid, area_code, type_, city, district, uid, name_, address,
# street, request_name, submit_time.  Every sheet row is queued, including a
# header row if the sheet has one.
for row in data_selfadd:
    _queue_request(row[3], row[4], row[6])
del data_selfadd

fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339'
data_jmtool = gen_file_data(fname_source)
# Column layout: dbid, area_code, name_, request_name, type_, city, district,
# addr, street, bd_status, ... (24 columns in total).
for row in data_jmtool:
    _queue_request(row[5], row[6], row[2])
del data_jmtool

write_res_file_dir = os.path.join(curPath, dir_, '')


def write_res_file(input_, str_, dir_=write_res_file_dir, file_postfix='.txt', ak=None, url_=None):
    """Write ``str_`` to ``<dir_><input_><file_postfix>`` unless it is an error response.

    When ``str_`` contains one of the ``ex_l`` markers nothing is written and
    the marker is printed with ``ak`` and ``url_``.  These two are optional
    diagnostics supplied by the caller; the original read them through
    ``global`` names that were never defined, so every error response raised
    ``NameError``.
    """
    for ex in ex_l:
        if ex in str_:
            print('EXCEPTION-', ex, 'AK-', ak, 'URL-', url_)
            return
    fname = '%s%s%s' % (dir_, input_, file_postfix)
    with open(fname, 'w', encoding='utf-8') as ft:
        ft.write(str_)
    print('ok', threading.get_ident(), input_)


class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` with a single positional argument."""

    def __init__(self, func, args, name):
        threading.Thread.__init__(self, name=name)
        self.func, self.args = func, args

    def run(self):
        self.func(self.args)


# Example request (API key redacted):
#   .../search?query=瀛嘉天下&region=重庆市&city_limit=true&output=json&ak=<AK>
# NOTE(review): the scheme, host, path and ``query=R-QUERY`` part were missing
# from the source as received ('&reg' had been decoded to '®').  They are
# restored here on the assumption that the target is the Baidu Place search
# endpoint -- confirm before running.
base_url = ('http://api.map.baidu.com/place/v2/search'
            '?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK')


def fun_(city):
    """Fetch and cache the search result of every queued name of ``city``.

    Stops as soon as no API key with remaining quota is available.
    """
    for district in request_dic[city]:
        for request_name in request_dic[city][district]:
            request_name_chk = '%s%s%s' % (city, district, request_name)
            if request_name_chk in _requested_files:
                continue
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # The original ``break`` only left the inner loop and then
                # re-queried the exhausted key table once per district.
                return
            # Percent-encode the values so names containing '&', '#' or '%'
            # cannot alter the query string.
            url_ = (base_url
                    .replace('R-QUERY', urllib.parse.quote(request_name, safe=''))
                    .replace('R-CITY', urllib.parse.quote(city, safe=''))
                    .replace('R-AK', ak))
            try:
                bd_res_json_str = requests.get(url_, timeout=REQUEST_TIMEOUT_SECONDS).text
                db_update_one_today_used(ak)
                write_res_file(request_name_chk, bd_res_json_str, ak=ak, url_=url_)
            except Exception as exc:
                # Best effort by design: record the failure and carry on.
                bd_res_json_str = '请求百度-异常'
                write_res_file(request_name_chk, bd_res_json_str, requested_file_dir_exception_str)
                print(request_name_chk, bd_res_json_str, repr(exc))


# Optional command-line arguments: first and last (1-based, inclusive)
# position in the sorted city list to process.
try:
    start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
except (IndexError, ValueError):
    start_loop, stop_loop = -1, 200


def main():
    """Run ``fun_`` in one thread per selected city and wait for all of them."""
    threads_list = []
    for nloop, city in enumerate(sorted(request_dic), start=1):
        if nloop < start_loop or nloop > stop_loop:
            continue
        threads_list.append(MyThread(fun_, city, fun_.__name__))
    for t in threads_list:
        # The original assigned to ``t.setDaemon``, which replaced the method
        # instead of setting the flag.
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


if __name__ == '__main__':
    main()