from openpyxl import Workbook
import xlrd
import time
import Levenshtein as Le

# Only rows whose city column is one of these values are processed.
target_city_list = ['北京市', '上海市', '深圳市', '广州市']
source_name = 'JMTool任务_csv_py_wholeCSV-加百度170826165729'
# '|-|' separates the candidate entries inside the BDpoi_list column;
# '|--|' separates a candidate's name from its address.
BDpoi_list_tag, BDpoi_list_tagb = '|-|', '|--|'
FEXCEL = '%s%s' % (source_name, '.xlsx')
# Weights of Le.ratio (name vs. candidate name) and Le.seqratio
# ([name, address] vs. [candidate name, candidate address]) in the final score.
weight_ratio, weight_seqratio = 0.7, 0.3

# Number of leading columns copied verbatim from the input sheet.
_SOURCE_COLUMNS = 9
# Position of the combined score inside a scored output row
# (source columns, then BD_name, BD_addr, ratio_seqratio_res, ...).
_SCORE_INDEX = _SOURCE_COLUMNS + 2


def _split_poi_entry(entry, city, district):
    """Split one BDpoi_list entry into ``(name, address)``.

    The address is only taken when the entry has exactly two '|--|'-separated
    parts; city and district are then removed from it. In every other case the
    address is the empty string.
    """
    parts = entry.split(BDpoi_list_tagb)
    if len(parts) == 2:
        return parts[0], parts[1].replace(city, '').replace(district, '')
    return parts[0], ''


def _scored_rows(source_row, name_, addr_, city, district, entries):
    """Return one output row per non-empty entry, sorted by ascending score.

    Each row is ``source_row`` followed by BD_name, BD_addr, the weighted
    score, Le.ratio and Le.seqratio. The sort is stable, so candidates with
    equal scores keep their order of appearance in ``entries``.
    """
    reference = [name_, addr_]
    rows = []
    for entry in entries:
        if not entry:
            continue
        BD_name, BD_addr = _split_poi_entry(entry, city, district)
        ratio_res = Le.ratio(name_, BD_name)
        seqratio_res = Le.seqratio(reference, [BD_name, BD_addr])
        ratio_seqratio_res = weight_ratio * ratio_res + weight_seqratio * seqratio_res
        rows.append(source_row + (BD_name, BD_addr, ratio_seqratio_res, ratio_res, seqratio_res))
    rows.sort(key=lambda scored: scored[_SCORE_INDEX])
    return rows


def _iter_match_lists(res_dic):
    """Yield every per-name row list of the city -> district -> name mapping."""
    for districts in res_dic.values():
        for names in districts.values():
            yield from names.values()


def _build_workbook(header, rows):
    """Create a workbook whose active sheet holds ``header`` followed by ``rows``."""
    wb = Workbook()
    worksheet = wb.active
    worksheet.append(header)
    for row in rows:
        worksheet.append(row)
    return wb


def main_():
    """Score the POI candidates of every input row and write two workbooks.

    Reads the first sheet of ``FEXCEL``, keeps the rows whose city is in
    ``target_city_list`` and compares each row's name/address with every
    candidate found in its BDpoi_list column. Two .xlsx files are saved in
    the current directory: one with every candidate (ascending score within
    each source row) and one with a single, last-appended row per name.

    Raises:
        ValueError: if a sheet row does not have exactly nine columns.
    """
    # NOTE(review): xlrd 2.0+ only reads .xls files; opening this .xlsx needs
    # xlrd < 2.0 -- confirm the pinned version.
    data = xlrd.open_workbook(FEXCEL)
    table = data.sheets()[0]

    res_dic = {}
    for i in range(table.nrows):
        source_row = tuple(table.row_values(i))
        (dbid, area_code, ref_area_type_code, city, district,
         address, city_street, name_, BDpoi_list) = source_row
        if dbid == 'dbid':
            # Header row of the input sheet.
            continue
        if city not in target_city_list:
            continue
        # Rows sharing (city, district, name_) accumulate in the same list.
        matches = res_dic.setdefault(city, {}).setdefault(district, {}).setdefault(name_, [])
        if BDpoi_list_tag not in BDpoi_list:
            # No candidates: keep the row with empty BD_name, BD_addr and score
            # (the two trailing ratio columns are simply left out).
            matches.append(source_row + ('', '', ''))
            continue
        addr_ = '%s%s%s%s' % (city, district, address, city_street)
        matches.extend(_scored_rows(source_row, name_, addr_, city, district,
                                    BDpoi_list.split(BDpoi_list_tag)))

    source_header = ['dbid', 'area_code', 'ref_area_type_code', 'city', 'district',
                     'address', 'city_street', 'name_', 'BDpoi_list']

    # Workbook 1: every candidate row.
    header = source_header + ['BD_name', 'BD_addr', 'ratio_seqratio_res', 'ratio_res', 'seqratio_res']
    all_rows = (row for matches in _iter_match_lists(res_dic) for row in matches)
    wb = _build_workbook(header, all_rows)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s' % (source_name, '-Levenshtein', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))

    # Workbook 2: the last row stored for each name, i.e. the highest-scoring
    # candidate of the last source row carrying that name. Names whose list is
    # empty (BDpoi_list held the separator but no non-empty entry) are skipped
    # instead of raising IndexError.
    header = source_header + ['max_BD_name', 'max_BD_addr', 'max_ratio_seqratio_res',
                              'ratio_res', 'seqratio_res']
    best_rows = (matches[-1] for matches in _iter_match_lists(res_dic) if matches)
    wb = _build_workbook(header, best_rows)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    # The second name extends the first workbook's name (kept as in the
    # original script), so it carries both suffixes and both timestamps.
    file_name = '%s%s%s' % (file_name, '-Levenshtein-ordered', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))


main_()
from openpyxl import Workbook
import xlrd
import time
import Levenshtein as Le

# Only rows whose city column is one of these values are processed.
target_city_list = ['深圳市']
# '|-|' separates the candidate entries inside the BDpoi_list column;
# '|--|' separates a candidate's name from its address.
BDpoi_list_tag, BDpoi_list_tagb = '|-|', '|--|'
source_name = 'JMTool任务_csv_py_wholeCSV_住宅小区-加百度170826152533'
FEXCEL = '%s%s' % (source_name, '.xlsx')
# Weights of Le.ratio (reduced name vs. candidate name) and Le.seqratio
# ([reduced name, address] vs. [candidate name, candidate address]).
weight_ratio, weight_seqratio = 0.7, 0.3

# Number of leading columns copied verbatim from the input sheet.
_SOURCE_COLUMNS = 10
# Position of the combined score inside a scored output row
# (source columns, then BD_name, BD_addr, ratio_seqratio_res, ...).
_SCORE_INDEX = _SOURCE_COLUMNS + 2


def _split_poi_entry(entry, city, district):
    """Split one BDpoi_list entry into ``(name, address)``.

    The address is only taken when the entry has exactly two '|--|'-separated
    parts; city and district are then removed from it. In every other case the
    address is the empty string.
    """
    parts = entry.split(BDpoi_list_tagb)
    if len(parts) == 2:
        return parts[0], parts[1].replace(city, '').replace(district, '')
    return parts[0], ''


def _scored_rows(source_row, name_reduction, addr_, city, district, entries):
    """Return one output row per non-empty entry, sorted by ascending score.

    Candidates are compared against ``name_reduction`` (not the full name).
    Each row is ``source_row`` followed by BD_name, BD_addr, the weighted
    score, Le.ratio and Le.seqratio. The sort is stable, so candidates with
    equal scores keep their order of appearance in ``entries``.
    """
    reference = [name_reduction, addr_]
    rows = []
    for entry in entries:
        if not entry:
            continue
        BD_name, BD_addr = _split_poi_entry(entry, city, district)
        ratio_res = Le.ratio(name_reduction, BD_name)
        seqratio_res = Le.seqratio(reference, [BD_name, BD_addr])
        ratio_seqratio_res = weight_ratio * ratio_res + weight_seqratio * seqratio_res
        rows.append(source_row + (BD_name, BD_addr, ratio_seqratio_res, ratio_res, seqratio_res))
    rows.sort(key=lambda scored: scored[_SCORE_INDEX])
    return rows


def _iter_match_lists(res_dic):
    """Yield every per-name row list of the city -> district -> name mapping."""
    for districts in res_dic.values():
        for names in districts.values():
            yield from names.values()


def _build_workbook(header, rows):
    """Create a workbook whose active sheet holds ``header`` followed by ``rows``."""
    wb = Workbook()
    worksheet = wb.active
    worksheet.append(header)
    for row in rows:
        worksheet.append(row)
    return wb


def main_():
    """Score the POI candidates of every input row and write two workbooks.

    Reads the first sheet of ``FEXCEL``, keeps the rows whose city is in
    ``target_city_list`` and compares each row's reduced name/address with
    every candidate found in its BDpoi_list column. Two .xlsx files are saved
    in the current directory: one with every candidate (ascending score
    within each source row) and one with a single, last-appended row per name.

    Raises:
        ValueError: if a sheet row does not have exactly ten columns.
    """
    # NOTE(review): xlrd 2.0+ only reads .xls files; opening this .xlsx needs
    # xlrd < 2.0 -- confirm the pinned version.
    data = xlrd.open_workbook(FEXCEL)
    table = data.sheets()[0]

    res_dic = {}
    for i in range(table.nrows):
        source_row = tuple(table.row_values(i))
        (dbid, area_code, ref_area_type_code, city, district,
         address, city_street, name_, name_reduction, BDpoi_list) = source_row
        if dbid == 'dbid':
            # Header row of the input sheet.
            continue
        if city not in target_city_list:
            continue
        # Rows are grouped by the full name_, even though scoring uses
        # name_reduction; rows sharing (city, district, name_) share one list.
        matches = res_dic.setdefault(city, {}).setdefault(district, {}).setdefault(name_, [])
        if BDpoi_list_tag not in BDpoi_list:
            # No candidates: keep the row with empty BD_name, BD_addr and score
            # (the two trailing ratio columns are simply left out).
            matches.append(source_row + ('', '', ''))
            continue
        addr_ = '%s%s%s%s' % (city, district, address, city_street)
        matches.extend(_scored_rows(source_row, name_reduction, addr_, city, district,
                                    BDpoi_list.split(BDpoi_list_tag)))

    source_header = ['dbid', 'area_code', 'ref_area_type_code', 'city', 'district',
                     'address', 'city_street', 'name_', 'name_reduction', 'BDpoi_list']

    # Workbook 1: every candidate row.
    header = source_header + ['BD_name', 'BD_addr', 'ratio_seqratio_res', 'ratio_res', 'seqratio_res']
    all_rows = (row for matches in _iter_match_lists(res_dic) for row in matches)
    wb = _build_workbook(header, all_rows)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s' % (source_name, '-Levenshtein', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))

    # Workbook 2: the last row stored for each name, i.e. the highest-scoring
    # candidate of the last source row carrying that name. Names whose list is
    # empty (BDpoi_list held the separator but no non-empty entry) are skipped
    # instead of raising IndexError.
    header = source_header + ['max_BD_name', 'max_BD_addr', 'max_ratio_seqratio_res',
                              'ratio_res', 'seqratio_res']
    best_rows = (matches[-1] for matches in _iter_match_lists(res_dic) if matches)
    wb = _build_workbook(header, best_rows)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    # The second name extends the first workbook's name (kept as in the
    # original script), so it carries both suffixes and both timestamps.
    file_name = '%s%s%s' % (file_name, '-Levenshtein-ordered', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))


main_()