zoukankan      html  css  js  c++  java
  • 放弃枚举的低效方法 挖掘规律 让程序去学习

     G-->f

    import csv
    import time
    import xlrd
    from openpyxl import Workbook
    
    # Place-name tags for 东莞市 and 中山市, loaded from a text file.
    # data_file_extract() uses the 东莞市 tags as a substitute for the district.
    ZHITONGZI_CITY_DIC = {'东莞市': [], '中山市': []}
    # Number of '、'-separated groups with more than two names seen so far.
    # Groups 1-2 are filed under 东莞市, groups 3-4 under 中山市.
    c = 0
    # `with` guarantees the handle is closed; the earlier `f.closed` only read
    # an attribute and closed nothing.
    with open('直筒子市_东莞中山.txt', 'r', encoding='utf-8') as f:
        for i in f:
            # Drop the line terminator so the last name on a line does not keep
            # a trailing newline that could never match an address.
            ii = i.strip().replace(' ', '').split(';')
            for iii in ii:
                iv = iii.split('、')
                if len(iv) <= 2:
                    continue
                c += 1
                if c in (1, 2):
                    target_ = ZHITONGZI_CITY_DIC['东莞市']
                elif c in (3, 4):
                    target_ = ZHITONGZI_CITY_DIC['中山市']
                else:
                    continue
                for v in iv:
                    if v.find('(') > -1:
                        # First name of a parenthesised group: keep what follows '('.
                        v_ = v.split('(')[1]
                    elif v.find(')') > -1:
                        # Last name of a parenthesised group: keep what precedes ')'.
                        v_ = v.split(')')[0]
                    else:
                        v_ = v
                    # An empty tag would match every address, so skip it.
                    if v_:
                        target_.append(v_)
    
    # Chinese digit characters; the list index equals the digit value.
    zh_num_list = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九']
    # Chinese digit character -> Arabic digit string.
    zh_num_zhk_dic = {zh_char: str(value) for value, zh_char in enumerate(zh_num_list)}
    # Arabic digit string -> Chinese digit character.
    zh_num_numk_dic = {str(value): zh_char for value, zh_char in enumerate(zh_num_list)}
    
    # 天河区	中石化大厦A塔
    # Prefixes that can precede a building/unit suffix: compass words, Roman
    # numerals, single Arabic digits and upper-case ASCII letters.
    nswe_m_list = ['东', '西', '南', '北', '中']
    roman_numerals_12_list = ['Ⅰ', 'Ⅱ', 'Ⅲ', 'Ⅳ', 'Ⅴ', 'Ⅵ', 'Ⅶ', 'Ⅷ', 'Ⅸ', 'Ⅹ', 'Ⅺ', 'Ⅻ']
    arabic_numerals_10_list = [str(i) for i in range(0, 10, 1)]
    postfix_list = ['座', '区', '栋', '楼', '院', '阁', '期', '单元', '号', '塔', '幢', '馆']
    alphabet_list = [chr(i).upper() for i in range(97, 123)]

    # Markers such as '东座', 'Ⅲ期', 'A栋', '9号'; name_reduction() cuts a name at
    # the first occurrence of each marker.
    name_split_list = []
    del_char_list = ['.', '·', '-', ' ']
    del_tail_list = ['第']
    suspect_char_list = ['(', '(']
    name_format_replace_dic = {}
    name_format_replace_dic[' '] = ''
    name_format_replace_dic['+'] = '加'
    gd_paralleling = '|'
    gd_separator = ';'
    diy_join_tag = '||'

    for postfix in postfix_list:
        # The digit group now covers 0-9. The earlier range(0, 9) stopped at 8,
        # so markers like '9座' were never generated.
        for prefix_group in (nswe_m_list, roman_numerals_12_list, alphabet_list, arabic_numerals_10_list):
            name_split_list.extend('%s%s' % (prefix, postfix) for prefix in prefix_group)
    
    
    def zh_num_format(str_):
        """Return ``str_`` with each Chinese digit character replaced by its Arabic digit.

        The character-to-digit mapping is the module-level ``zh_num_zhk_dic``.
        """
        converted = str_
        for zh_char, digit in zh_num_zhk_dic.items():
            converted = converted.replace(zh_char, digit)
        return converted
    
    
    def replace_zhnum_num(str_):
        """Return ``str_`` with each Arabic digit replaced by its Chinese digit character.

        The digit-to-character mapping is the module-level ``zh_num_numk_dic``.
        """
        converted = str_
        for digit, zh_char in zh_num_numk_dic.items():
            # replace() is already a no-op when the digit is absent.
            converted = converted.replace(digit, zh_char)
        return converted
    
    
    def alphabet_upper_format(str_):
        """Return ``str_`` with every lower-case ASCII letter converted to upper case.

        ``alphabet_list`` already holds the upper-case letters A-Z, so the
        earlier ``replace(i, i.upper())`` replaced each letter with itself and
        changed nothing. Each letter's lower-case form is now the search text.
        """
        for upper_letter in alphabet_list:
            str_ = str_.replace(upper_letter.lower(), upper_letter)
        return str_
    
    
    def del_char(str_):
        """Return ``str_`` with every entry of the module-level ``del_char_list`` removed."""
        cleaned = str_
        for unwanted in del_char_list:
            cleaned = cleaned.replace(unwanted, '')
        return cleaned
    
    
    # ART SPACE
    # '虹口SOHO'
    # 6A
    
    def del_tail_filter_list(str_, filter_list):
        """Strip the trailing run of characters that appear in ``filter_list``.

        The first character of ``str_`` is never examined, so a non-empty
        input always keeps at least its first character.
        """
        end = len(str_)
        # Stop at index 1: position 0 is deliberately left untouched.
        while end > 1 and str_[end - 1] in filter_list:
            end -= 1
        return str_[:end]
    
    
    def del_tail_filter(str_):
        """Strip trailing Arabic digits, then trailing upper-case ASCII letters.

        The two passes run once each in that order; digits exposed by the
        letter pass are not stripped again.
        """
        without_digits = del_tail_filter_list(str_, arabic_numerals_10_list)
        return del_tail_filter_list(without_digits, alphabet_list)
    
    
    # '上海加华商务中心A9座'
    def name_reduction_format(original_):
        """Normalise a raw building name before suffix-based reduction.

        Steps: apply ``name_format_replace_dic``, keep only the text before
        the first '(', '-' and '、', then upper-case, delete the characters in
        ``del_char_list`` and turn Chinese digits into Arabic digits.

        Returns the truncated (but not upper-cased) name when the normalised
        form is shorter than ``MIN_NAME_LEN``, otherwise the normalised form.
        """
        for old, new in name_format_replace_dic.items():
            original_ = original_.replace(old, new)
        # NOTE(review): the listing tested '(' twice in a row; one of the two
        # was presumably the full-width parenthesis - confirm against real data.
        for separator in ('(', '-', '、'):
            # split()[0] returns the whole string when the separator is absent.
            original_ = original_.split(separator)[0]
        format_ = alphabet_upper_format(zh_num_format(del_char(original_.upper())))
        return original_ if len(format_) < MIN_NAME_LEN else format_
    
    
    def name_reduction(format_):
        """Reduce a building name to the name of the estate it belongs to.

        The normalised name is cut at every marker in ``name_split_list``
        (no early exit, e.g. 佳兆业可园六期2区C座湖西路), trailing digits and
        letters are stripped, and Arabic digits are written back as Chinese
        digits. If the result is shorter than ``MIN_NAME_LEN`` the input is
        used instead. A final character listed in ``del_tail_list`` is dropped.
        """
        reduction_ = name_reduction_format(format_)
        for marker in name_split_list:
            # split()[0] leaves the string unchanged when the marker is absent.
            reduction_ = reduction_.split(marker)[0]
        reduction_ = replace_zhnum_num(del_tail_filter(reduction_))
        if len(reduction_) < MIN_NAME_LEN:
            reduction_ = format_
        for tail in del_tail_list:
            if reduction_[-1:] == tail:
                reduction_ = reduction_[:-1]
        return reduction_
    
    
    # X大厦(abc
    # abc
    def chk_cross_name(str_, str__):
        """Return 1 if the shorter name appears after the first '(' of the longer one.

        Example: 'abc' and 'X大厦(abc' give 1. Returns 0 in all other cases.
        The argument order does not matter.
        """
        if len(str_) > len(str__):
            shorter, longer = str__, str_
        else:
            shorter, longer = str_, str__
        if shorter not in longer:
            return 0
        pieces = longer.split('(')
        # Only the segment right after the first '(' is inspected.
        if len(pieces) > 1 and shorter in pieces[1]:
            return 1
        return 0
    
    
    def res_list_str(dic_, dk, filter_list=(',',)):
        """Join the items of ``dic_[dk]`` into one string that is safe for a CSV cell.

        Items are converted with ``str`` and joined with the module-level
        ``diy_join_tag``. Every substring in ``filter_list`` and every newline
        is then removed.

        Args:
            dic_: mapping whose value at ``dk`` is an iterable of items.
            dk: key to look up in ``dic_``.
            filter_list: substrings to delete from the joined text.

        Returns:
            The joined, filtered string.
        """
        str_ = diy_join_tag.join(str(item) for item in dic_[dk])
        for unwanted in filter_list:
            str_ = str_.replace(unwanted, '')
        # The listing had a raw line break inside the quotes (a syntax error);
        # the '\n' escape is restored here.
        return str_.replace('\n', '')
    
    
    def chk_name_subname(str_, str__):
        """Return 0 if the longer name starts with the shorter one, otherwise 1.

        The argument order does not matter. An empty shorter name raises
        ``ValueError`` (empty separator).
        """
        shorter, longer = (str__, str_) if len(str_) > len(str__) else (str_, str__)
        head = longer.partition(shorter)[0]
        return 0 if head == '' else 1
    
    
    def gen_show_addr(l, district):
        """Pick the longest address in ``l`` and cut it down to the part after ``district``.

        The earlier sort key returned the list length for every element, so
        the "longest" address was really just the last one. The key is now the
        length of each address; on a tie the later address wins.

        Args:
            l: non-empty list of address strings.
            district: district name. When it occurs in the chosen address, the
                stripped text between its first and second occurrence (or the
                end of the string) is returned.

        Returns:
            The display address.

        Raises:
            IndexError: if ``l`` is empty.
        """
        res_ = sorted(l, key=len)[-1]
        # str.split('') raises ValueError, so an empty district leaves the address as is.
        if district:
            ll = res_.split(district)
            if len(ll) > 1:
                res_ = ll[1].strip()
        return res_
    
    
    def gen_gd_type_single_str(gd_type_list, filter_):
        """Select one type string containing ``filter_`` from ``gd_type_list``.

        An entry without ``gd_paralleling`` that contains ``filter_`` is
        returned immediately. An entry with ``gd_paralleling`` contributes its
        first matching part, but scanning continues, so a later entry can
        replace it. Returns '' when nothing matches.
        """
        picked = ''
        for entry in gd_type_list:
            if filter_ not in entry:
                continue
            if gd_paralleling not in entry:
                picked = entry
                break
            # Keep the previous pick if no single part contains the filter.
            picked = next(
                (part for part in entry.split(gd_paralleling) if filter_ in part),
                picked)
        return picked
    
    
    def gen_show_gd_type_dic(gd_type_list, filter_):
        """Build the type columns for one record from its list of type strings.

        ``filter_`` is narrowed to '楼宇' or '住宅小区' when it contains one of
        them. The type string selected by ``gen_gd_type_single_str`` is then
        split on ``gd_separator`` into three levels.

        Args:
            gd_type_list: list of type strings.
            filter_: text identifying the wanted category.

        Returns:
            Dict with 'gd_type_list_str' (all types joined by ``diy_join_tag``)
            and 'gd_type_0', 'gd_type_1', 'gd_type_2'. The three levels stay ''
            when no type matches.

        Raises:
            ValueError: if the selected type string does not have exactly
                three ``gd_separator``-separated parts.
        """
        dic_ = {
            'gd_type_list_str': diy_join_tag.join(gd_type_list),
            'gd_type_0': '',
            'gd_type_1': '',
            'gd_type_2': '',
        }
        if filter_.find('楼宇') > -1:
            filter_ = '楼宇'
        elif filter_.find('住宅小区') > -1:
            filter_ = '住宅小区'
        gd_type_single_str = gen_gd_type_single_str(gd_type_list, filter_)
        # With no match the selected string is ''. Unpacking ''.split(...) into
        # three names used to raise ValueError, so keep the defaults instead.
        if gd_type_single_str:
            dic_['gd_type_0'], dic_['gd_type_1'], dic_['gd_type_2'] = gd_type_single_str.split(gd_separator)
        return dic_
    
    
    def gen_show_gd_type_dic_fromstr(gd_type_str, filter_):
        """Split a raw type string into the three type-level columns.

        ``filter_`` is narrowed to '楼宇' or '住宅小区' when it contains one of
        them. ``gd_type_str`` is split on ``gd_paralleling``, and the last
        alternative containing the filter supplies the levels. The levels stay
        '' when nothing matches.

        Raises ``ValueError`` if a matching alternative does not have exactly
        three ``gd_separator``-separated parts.
        """
        dic_ = dict.fromkeys(('gd_type_0', 'gd_type_1', 'gd_type_2'), '')
        for keyword in ('楼宇', '住宅小区'):
            if keyword in filter_:
                filter_ = keyword
                break
        for gd_type in gd_type_str.split(gd_paralleling):
            if filter_ in gd_type:
                level_0, level_1, level_2 = gd_type.split(gd_separator)
                dic_.update(gd_type_0=level_0, gd_type_1=level_1, gd_type_2=level_2)
        return dic_
    
    
    def compute_list(l):
        """Return the arithmetic mean of ``l`` after converting each item with ``float``.

        Raises ``ZeroDivisionError`` for an empty list.
        """
        total = 0
        # Plain left-to-right accumulation, one float conversion per item.
        for value in map(float, l):
            total += value
        return total / len(l)
    
    
    def res_list(dic_, dk):
        """Return the mean, via ``compute_list``, of the values stored at ``dic_[dk]``."""
        return compute_list(dic_[dk])
    
    
    # Task workbook; the third column of its first sheet holds the city name.
    FEXCEL = '【商场任务】28个城市_任务列表_20170727.xlsx'
    data = xlrd.open_workbook(FEXCEL)
    table = data.sheets()[0]
    nrows, ncols = table.nrows, table.ncols
    res_dic = {}

    # Distinct city names in first-seen order; the first row is a header.
    target_city_list = []
    flag_title = 0
    source_file_line_num = 0
    for i in range(nrows):
        source_file_line_num = i + 1
        if flag_title == 0:
            # Skip the header row once.
            flag_title = 1
            continue
        l = table.row_values(i)
        city = l[2]
        if city not in target_city_list:
            target_city_list.append(city)

    city_zhixiashi_list = ['北京市', '上海市', '天津市', '重庆市']
    filter_city_list = ['北京市', '上海市', '广州市', '深圳市']

    # Input CSV files and the stems of the generated workbook names.
    file_house = '住宅小区.csv'
    file_bizbuilding = '楼宇.csv'
    file_gen_house = '含北上广深28城市-住宅小区-归约化'
    file_gen_bizbuilding = '含北上广深28城市-商住楼宇-归约化'

    # Comma-separated header rows for the data sheet and the statistics sheet.
    file_title_str = 'province,city,district,商圈,商圈类型,归约后的名,name_original,show_addr,show_addr_num,gd_type,gd_type_1,gd_type_2,gd_type_3,locationx,locationy,gpsx,gpsy,bdx,bdy,localtime'
    file_title_str_statistics = '省份,城市,源文件行数,处理后的文件行数,压缩率,参考总建筑区总楼栋数目,总建筑区名字数,参考独栋数,参考独栋数率,参考单建筑区的平均楼栋数目'

    # Reduced names shorter than this fall back to the unreduced name.
    MIN_NAME_LEN = 2
    
    
    def data_file_extract(file_name):
        """Read a POI CSV file and group its rows by province, city and district.

        Only rows whose city is in ``target_city_list`` are kept. Each kept row
        gains two keys: 'file_line_num' (row position, counting the header as 1)
        and 'name_reduction' (the name after three passes of ``name_reduction``).

        Returns:
            ``{province: {city: {'source_file_sum_city_district': int,
            'district_dic': {district: {'name_reduction_dic': {name: count},
            'dic_list': [row, ...]}}}}}``
        """
        res_dic = {}
        with open(file_name, 'r', encoding='utf-8-sig') as csvfile:
            for file_line_num, d in enumerate(csv.DictReader(csvfile), start=2):
                province = d['province']
                city = d['city']
                district = d['district']
                # A city value containing '[' is replaced by the province name.
                if city.find('[') != -1:
                    city = province
                if city not in target_city_list:
                    continue

                city_entry = res_dic.setdefault(province, {}).setdefault(
                    city, {'source_file_sum_city_district': 0, 'district_dic': {}})

                if city == '东莞市':
                    # The district is replaced by the last matching tag found in
                    # the address text after the city name, defaulting to '松山湖'.
                    after_city = d['addr'].split('东莞市')[1]
                    district = '松山湖'
                    for tag_ in ZHITONGZI_CITY_DIC['东莞市']:
                        if after_city.find(tag_) != -1:
                            district = tag_

                district_entry = city_entry['district_dic'].setdefault(
                    district, {'name_reduction_dic': {}, 'dic_list': []})

                # Three passes are needed for names such as 水岸阳光B小区b区C幢(C-幢).
                name_ = d['name']
                for _ in range(3):
                    name_ = name_reduction(name_)
                d['file_line_num'] = file_line_num
                d['name_reduction'] = name_

                name_counts = district_entry['name_reduction_dic']
                name_counts[name_] = name_counts.get(name_, 0) + 1
                city_entry['source_file_sum_city_district'] += 1
                district_entry['dic_list'].append(d)

        return res_dic
    
    
    def data_self_reduction(self_dic_):
        """Merge reduced names that contain another reduced name, district by district.

        For every district two passes are run over its 'name_reduction_dic',
        first with the names sorted ascending, then descending. The dict is
        modified in place and ``self_dic_`` itself is returned.
        """
        for province_entry in self_dic_.values():
            for city_entry in province_entry.values():
                for district_entry in city_entry['district_dic'].values():
                    name_counts = district_entry['name_reduction_dic']
                    rows = district_entry['dic_list']
                    _absorb_longer_names(name_counts, rows, False)
                    # The second pass re-sorts the names that survived the first.
                    _absorb_longer_names(name_counts, rows, True)
        return self_dic_


    def _absorb_longer_names(name_counts, rows, reverse):
        """Run one merge pass: count a contained name up and drop the containing one."""
        # sorted() takes a snapshot, so deleting keys during the pass is safe.
        for candidate in sorted(name_counts, reverse=reverse):
            for row in rows:
                row_name = row['name_reduction']
                if row_name == candidate:
                    continue
                if len(row_name) < len(candidate):
                    shorter_, longer_ = row_name, candidate
                else:
                    shorter_, longer_ = candidate, row_name
                if shorter_ not in longer_:
                    continue
                if shorter_ in name_counts:
                    name_counts[shorter_] += 1
                if longer_ in name_counts:
                    del name_counts[longer_]
    
    
    def gen_file(data_file_reduction_dic, file_name, file_title_str, file_title_str_statistics):
        """Write the reduced records and per-city statistics to two .xlsx workbooks.

        A record is written only when its reduced name is still a key of its
        district's 'name_reduction_dic'. It is skipped when its own formatted
        name is a key with a count above 1.

        Args:
            data_file_reduction_dic: nested dict as returned by ``data_self_reduction``.
            file_name: stem of the output file names; also passed on as the
                type filter for the gd_type columns.
            file_title_str: comma-separated header of the data sheet.
            file_title_str_statistics: comma-separated header of the statistics sheet.

        Side effects:
            Saves '<file_name><timestamp>.xlsx' and '<file_name><timestamp>-统计.xlsx'.
        """

        def ratio(numerator, denominator):
            # A city can end up with no kept rows or names; report 0 instead of
            # raising ZeroDivisionError.
            return numerator / denominator if denominator else 0

        wb = Workbook()
        worksheet = wb.active
        worksheet.append(file_title_str.replace(' ', '').split(','))

        wb_statistics = Workbook()
        worksheet_statistics = wb_statistics.active
        worksheet_statistics.append(file_title_str_statistics.replace(' ', '').split(','))

        name_reduction_all_num, row_original_all_num, row_res_all_num, name_reduction_single_all_num = 0, 0, 0, 0
        # Set up front so the file names can be built even when no row is
        # written; it is refreshed for every written row as before.
        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())

        for province in data_file_reduction_dic:
            for city, city_entry in data_file_reduction_dic[province].items():
                name_reduction_num, row_original_num, row_res_num = 0, 0, 0
                # Reduced names with a count of exactly 1, unique across the city.
                single_names = set()
                for district, district_entry in city_entry['district_dic'].items():
                    name_reduction_dic = district_entry['name_reduction_dic']
                    dic_list = district_entry['dic_list']
                    name_reduction_num += len(name_reduction_dic)
                    row_original_num += len(dic_list)
                    for reduced_name, reduced_count in name_reduction_dic.items():
                        for i in dic_list:
                            if i['name_reduction'] != reduced_name:
                                continue
                            if reduced_count == 1:
                                single_names.add(reduced_name)
                            name_original = i['name']
                            name_format = name_reduction_format(name_original)
                            if name_format in name_reduction_dic and name_reduction_dic[name_format] > 1:
                                continue
                            # This statement was split across two lines with no
                            # continuation in the listing (a syntax error).
                            gd_type, locationx, locationy, gpsx, gpsy, bdx, bdy = (
                                i['type'], i['locationx'], i['locationy'],
                                i['gpsx'], i['gpsy'], i['bdx'], i['bdy'])
                            show_gd_type_dic = gen_show_gd_type_dic_fromstr(gd_type, file_name)
                            show_addr_num = '%s%s' % (i['street'], i['number'])
                            show_addr = show_addr_num
                            if len(i['address'].strip()) > 2:
                                show_addr = i['address']
                            row_res_num += 1
                            localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
                            xlsx_list = [province, city, district, 'todo', 'todo', reduced_name, name_original,
                                         show_addr, show_addr_num, gd_type,
                                         show_gd_type_dic['gd_type_0'], show_gd_type_dic['gd_type_1'],
                                         show_gd_type_dic['gd_type_2'],
                                         locationx, locationy, gpsx, gpsy, bdx, bdy, localtime_]
                            worksheet.append(xlsx_list)

                name_reduction_single_num = len(single_names)
                # Columns: province, city, source rows, output rows, compression
                # ratio, total buildings, estate names, single-building names,
                # single-building ratio, average buildings per estate.
                worksheet_statistics.append([
                    province, city, row_original_num, row_res_num,
                    ratio(row_res_num, row_original_num),
                    row_res_num,
                    name_reduction_num, name_reduction_single_num,
                    ratio(name_reduction_single_num, row_res_num),
                    ratio(row_res_num, name_reduction_num)])

                # row_original_all_num is added once per city; the listing added
                # it twice, which doubled the overall source-row count.
                row_original_all_num += row_original_num
                row_res_all_num += row_res_num
                name_reduction_all_num += name_reduction_num
                name_reduction_single_all_num += name_reduction_single_num

        worksheet_statistics.append([
            'ALL', 'ALL', row_original_all_num, row_res_all_num,
            ratio(row_res_all_num, row_original_all_num),
            row_res_all_num,
            name_reduction_all_num, name_reduction_single_all_num,
            ratio(name_reduction_single_all_num, row_res_all_num),
            ratio(row_res_all_num, name_reduction_all_num)])
        file_name_save = '%s%s%s' % (file_name, localtime_, '-统计.xlsx')
        wb_statistics.save(file_name_save)
        file_name_save = '%s%s%s' % (file_name, localtime_, '.xlsx')
        wb.save(file_name_save)
    
    
    # Stage 1: load both CSV files (residential first, then commercial).
    data_file_extract_house, data_file_extract_bizbuilding = (
        data_file_extract(path) for path in (file_house, file_bizbuilding))

    # Stage 2: merge overlapping reduced names in each data set.
    data_self_reduction_house, data_self_reduction_bizbuilding = (
        data_self_reduction(extracted)
        for extracted in (data_file_extract_house, data_file_extract_bizbuilding))

    # Stage 3: write the data and statistics workbooks for each data set.
    for reduced, target_name in (
            (data_self_reduction_house, file_gen_house),
            (data_self_reduction_bizbuilding, file_gen_bizbuilding)):
        gen_file(reduced, target_name, file_title_str, file_title_str_statistics)
    

      

  • 相关阅读:
    中文词语的语法划分
    程序员转行可以做什么?
    Linux Crontab 定时任务 命令详解
    Spring对AOP的支持
    ASP.NET 2.0 Web Part编程入门
    linux random hung up
    VLC plugin加载代码分析
    mac上的ssh proxy客户端 iSSH个人修改版
    关于MP4 fileformat中 duration及timescale相关的几个地方
    inline 小结
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7359661.html
Copyright © 2011-2022 走看看