zoukankan      html  css  js  c++  java
  • 特征列 属性值 获取 vowpal wabbit 生成DNN 的训练测试数据

    用户特征文件 userFeature.data 每行代表一个用户的特征数据,格式为:“uid|features”,uid 和 features 用竖线“|”分隔。其中 features 采用 vowpal wabbit(https://github.com/JohnLangford/vowpal_wabbit)格式:“feature_group1|feature_group2|feature_group3|...”。每个 feature_group 代表一个特征组,多个特征组之间也以竖线“|”分隔。一个特征组若包括多个值,则以空格分隔,格式为:“feature_group_name fea_name1 fea_name2 …”,其中 fea_name 采用数据编号的格式。

    特征列  属性值      获取

    # Collect, for every feature-group name, the distinct integer values that
    # appear under it in the input file, then print each name followed by its
    # sorted values.
    # NOTE(review): `f` is not defined in this snippet; presumably it holds the
    # path to userFeature.data -- confirm before running.
    d = {}
    with open(f, 'r') as fr:
        for line in fr:
            # A line is '|'-separated groups; a group is "name v1 v2 ...".
            for group in line.split('|'):
                name, *raw_values = group.split(' ')
                # A dict is used as an insertion-ordered set: O(1) membership
                # instead of scanning a list for every value.
                seen = d.setdefault(name, {})
                for raw in raw_values:
                    seen[int(raw)] = None
    for k in d:
        # Hand back plain lists (first-seen order), as the list-based version did.
        d[k] = list(d[k])
        print(k)
        print(sorted(d[k]))
    

      

    批处理 减小运行时间

    # Batched variant: values are appended without a membership test, and every
    # `reduce_chk_step` input lines each feature's list is collapsed to its
    # distinct values.  The result is printed and written to `res_f`.
    # NOTE(review): `f` is not defined in this snippet; presumably it holds the
    # path to userFeature.data -- confirm before running.
    d, reduce_chk_counter, reduce_chk_step = {}, 0, 500000
    with open(f, 'r') as fr:
        for line in fr:
            for group in line.split('|'):
                tokens = group.split(' ')
                d.setdefault(tokens[0], []).extend(int(tok) for tok in tokens[1:])

            reduce_chk_counter += 1
            if reduce_chk_counter == reduce_chk_step:
                reduce_chk_counter = 0
                for k in d:
                    d[k] = list(set(d[k]))
                    print(k)
                    print(sorted(d[k]))

    # Final de-duplication for the lines read since the last checkpoint.
    for k in d:
        d[k] = list(set(d[k]))
        print(k)
        print(sorted(d[k]))

    res_f = 'toknowit.txt'
    with open(res_f, 'w') as fw:
        for k in d:
            l = sorted(d[k])
            print(k)
            print(l)
            # Two lines per feature: its name, then its comma-separated values.
            # The values are ints, so they are converted before joining
            # (str.join raises TypeError on non-string items).
            fw.write('{}\n{}\n'.format(k, ','.join(str(v) for v in l)))
    

      

    将数据冗余至内存,批处理,去重,释放内存

    # Path of the raw user-feature file that the scans below read.
    f = 'userFeature.data'
    # Determine the values of the marriageStatus feature column.
    # d  : length of a marriageStatus group -> one group text of that length
    # d1 : text following 'marriageStatus' in a group -> that group text
    d, d1 = {}, {}
    with open(f, 'r') as fr:
        for i in fr:
            # The unconditional break leaves the loop on the first line, so the
            # scan below never runs and both dicts stay empty.
            break
            l = i.split('|')
            for ii in l:
                if 'marriageStatus' in ii:
                    k = len(ii)
                    d[k] = ii
                    k = ii.split('marriageStatus')[-1]
                    d1[k] = ii
    # Print whatever was collected (nothing while the scan above is disabled).
    for k in d:
        print(k, d[k])
    for k in d1:
        print(k, d1[k])
    
    '''
    17 marriageStatus 11
    19 marriageStatus 2 13
    20 marriageStatus 13 10
    16 marriageStatus 0
    21 marriageStatus 2 13 9
    22 marriageStatus 12 13 9
    23 marriageStatus 12 13 10
    
     11 marriageStatus 11
     5 13 marriageStatus 5 13
     13 10 marriageStatus 13 10
     10 marriageStatus 10
     15 marriageStatus 15
     0 marriageStatus 0
     13 15 marriageStatus 13 15
     12 13 marriageStatus 12 13
     13 marriageStatus 13
     6 13 marriageStatus 6 13
     2 13 marriageStatus 2 13
     13 9 marriageStatus 13 9
     6 13 9 marriageStatus 6 13 9
     2 13 9 marriageStatus 2 13 9
     5 13 9 marriageStatus 5 13 9
     12 13 9 marriageStatus 12 13 9
     14 marriageStatus 14
     12 13 10 marriageStatus 12 13 10
     3 marriageStatus 3
     15 10 marriageStatus 15 10
     8 marriageStatus 8
     6 13 10 marriageStatus 6 13 10
     5 13 10 marriageStatus 5 13 10
     13 10 9 marriageStatus 13 10 9
     13 15 10 marriageStatus 13 15 10
     2 13 10 marriageStatus 2 13 10
     
     
     marriageStatus 0 2 5 6 8 9 10 11 12 13 15
    '''
    
    # First scan: collect the distinct integer values of every feature group,
    # de-duplicating with a list membership test.
    d = {}
    with open(f, 'r') as fr:
        for i in fr:
            # The unconditional break leaves the loop on the first line, so
            # nothing below it runs and d stays empty.
            break
            l = i.split('|')
            for ii in l:
                ll = ii.split(' ')
                k = ll[0]
                if k not in d:
                    d[k] = []
                for iii in ll[1:]:
                    iii_ = int(iii)
                    if int(iii) not in d[k]:
                        d[k].append(iii_)
    
    # Second scan: batched variant that skips the 'uid' group, appends every
    # value and collapses duplicates every `reduce_chk_step` lines.
    d, reduce_chk_counter, reduce_chk_step = {}, 0, 500000
    with open(f, 'r') as fr:
        for i in fr:
    
            # Same switch as above: the break skips the whole scan.
            break
    
            l = i.split('|')
    
            for ii in l:
                ll = ii.split(' ')
                k = ll[0]
                if k == 'uid':
                    continue
                if k not in d:
                    d[k] = []
                for iii in ll[1:]:
                    iii_ = int(iii)
                    d[k].append(iii_)
    
            reduce_chk_counter += 1
            if reduce_chk_counter == reduce_chk_step:
                reduce_chk_counter = 0
                for k in d:
                    d[k] = list(set(d[k]))
                    l = sorted(d[k], reverse=False)
                    print(k)
                    print(l)
    
    # Final de-duplication and printing; the break disables it as well.
    for k in d:
        break
    
        d[k] = list(set(d[k]))
        l = sorted(d[k], reverse=False)
        print(k)
        print(l)
    
    res_f = 'toknowit.txt'
    # Disabled writer that produced `res_f` from `d`, kept for reference:
    # with open(res_f, 'w') as fw:
    #     for k in d:
    #         l = sorted(d[k], reverse=False)
    #         print(k)
    #         print(l)
    #         s = '{}\n{}\n'.format(k, ','.join([str(i) for i in l]))
    #         fw.write(s)

    # Read `res_f` and keep at most the first 200 comma-separated fields of
    # every line.
    cut_l = []
    with open(res_f, 'r') as fr:
        for line in fr:
            cut_l.append(line.replace('\n', '').split(',')[0:200])

    res_f_cut = 'toknowitCUT.txt'

    # Write the truncated lines, one per line, to `res_f_cut`.
    with open(res_f_cut, 'w') as fw:
        for fields in cut_l:
            fw.write('{}\n'.format(','.join(fields)))
    age
    0,1,2,3,4,5
    gender
    0,1,2
    marriageStatus
    0,2,3,5,6,8,9,10,11,12,13,14,15
    education
    0,1,2,3,4,5,6,7
    consumptionAbility
    0,1,2
    LBS
    0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,16,18,19,20,21,23,25,26,27,29,30,31,32,33,35,36,38,39,40,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,60,61,62,63,64,65,66,67,68,69,70,71,72,73,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,91,92,94,95,97,98,99,100,101,102,103,104,105,106,107,108,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,135,137,138,139,142,143,144,145,146,147,149,150,151,152,153,154,155,157,158,159,160,161,163,165,168,170,171,172,173,174,175,176,177,178,179,180,181,183,184,185,186,188,189,190,191,192,193,194,195,197,198,199,200,201,202,203,204,206,208,209,210,211,212,214,215,216,217,218,219,220,222,223,224,225,227,229,232,233,234,235,236
    interest1
    1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122
    interest2
    1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,78,79,80,81,82
    interest5
    1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136
    kw1
    2,3,8,13,17,19,21,28,29,39,41,42,43,46,56,59,65,68,69,70,71,72,74,86,87,88,90,92,95,100,101,105,106,109,111,112,113,119,121,123,125,131,133,136,139,141,142,143,145,150,152,156,157,162,163,166,169,172,173,174,176,177,180,181,183,184,185,186,191,199,203,204,209,211,214,216,230,235,240,242,243,246,249,260,263,265,268,269,271,272,278,279,283,284,289,291,292,295,302,303,304,307,313,317,321,322,323,331,336,341,343,344,351,354,357,358,359,366,367,369,370,372,373,375,376,377,378,380,381,382,390,391,393,396,401,402,406,407,408,409,411,414,417,423,429,433,434,437,438,441,442,443,449,456,464,465,468,472,473,475,477,478,480,482,485,486,487,495,496,497,504,506,507,511,513,521,522,526,532,536,541,542,546,547,560,561,563,566,567,575,576,578,581,584,588,592,594,604,605,610
    kw2
    2,6,7,9,10,11,12,14,21,22,23,25,26,30,34,38,40,41,42,43,44,46,47,50,55,56,62,63,66,69,70,71,72,74,75,76,77,78,80,81,84,85,87,89,90,91,94,95,100,112,114,116,117,118,119,121,123,124,127,128,129,130,133,135,137,142,143,144,148,149,151,153,154,156,157,158,163,168,171,174,176,177,180,183,184,186,192,193,195,196,197,200,202,203,215,216,217,219,221,223,228,229,235,237,238,240,241,246,248,250,255,258,260,263,266,269,272,275,276,278,280,286,287,290,294,295,296,297,301,302,303,305,313,317,321,323,327,330,333,334,338,339,340,341,343,344,345,347,354,358,359,363,366,368,369,371,374,375,377,378,380,383,384,386,391,393,394,395,398,399,400,403,404,405,408,409,412,413,417,418,422,427,433,436,438,440,442,445,447,448,451,453,454,455,456,457,459,461,462,463
    topic1
    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
    topic2
    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
    ct
    0,1,2,3,4
    os
    0,1,2
    carrier
    0,1,2,3
    house
    1
    interest3
    1,2,3,4,5,6,7,8,9,10
    kw3
    1,7,8,10,15,19,25,27,29,36,50,56,63,68,69,74,77,80,88,93,95,101,117,122,123,124,126,132,133,136,138,149,151,152,153,155,157,164,167,171,173,174,181,186,188,190,192,194,197,198,206,209,213,223,228,233,235,248,249,253,263,273,276,278,280,286,288,295,302,303,311,314,316,323,328,331,332,333,336,343,349,362,364,366,370,372,381,385,391,394,399,401,404,411,412,416,420,425,427,431,453,459,464,465,469,470,474,488,499,504,505,508,512,513,523,530,531,534,539,549,559,560,563,566,568,570,574,581,586,588,598,607,610,617,627,630,633,634,635,636,638,645,650,654,655,657,663,668,676,677,681,685,686,687,691,692,694,695,696,699,701,703,705,707,709,719,722,723,725,734,735,737,739,740,742,745,751,755,763,764,769,771,780,785,788,799,800,805,809,818,821,833,835,836,840,851,853,856,860,862
    topic3
    1,3,4,8,10,11,14,16,18,19,21,22,23,24,25,27,28,30,31,32,33,34,35,37,39,42,43,44,46,47,49,51,53,54,55,56,58,59,60,62,63,65,66,68,69,70,72,75,76,78,79,81,84,87,88,90,92,95,98,99,100,101,102,103,107,108,109,111,112,113,115,116,117,119,120,121,123,124,126,127,129,130,132,133,136,137,138,139,141,142,143,146,148,150,151,154,157,158,159,161,162,164,165,166,167,168,169,171,174,176,177,178,180,182,183,185,186,187,188,190,191,192,193,194,197,198,199,201,202,205,206,207,209,210,211,212,213,214,215,216,217,218,219,220,221,223,226,227,228,232,233,234,235,237,238,240,241,243,251,252,253,255,256,258,259,260,262,264,265,266,267,268,269,270,271,272,273,274,275,278,279,280,282,283,285,287,288,292,297,298,299,301,304,305,306,307,308,309,312,314
    interest4
    1,2,3,4,5,6,7,8,9,10
    appIdAction
    1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200
    appIdInstall
    1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200
    

      

    res_f_cut = 'toknowitCUT.txt'

    # Disabled alternative writer for `res_f_cut`, kept for reference:
    # with open(res_f_cut, 'w') as fw:
    #     s = '\n'.join([','.join([str(i) for i in l]) for l in cut_l])
    #     fw.write(s)

    # `res_f_cut` is read as alternating lines: a feature name, then that
    # feature's comma-separated values.  A feature with at most
    # `sparse_num_drop_max` values is printed as one "<name>_<value>" column
    # per value; a feature with more values is printed as its bare name.
    sparse_num_drop_max, is_odd_line = 50, True
    with open(res_f_cut, 'r') as fr:
        for line in fr:
            fields = line.replace('\n', '').split(',')
            if is_odd_line:
                k = fields[0]
            elif len(fields) <= sparse_num_drop_max:
                for value in fields:
                    print('{}_{}'.format(k, value))
            else:
                print(k)
            # Name lines and value lines strictly alternate.
            is_odd_line = not is_odd_line
    

      

    age_0
    age_1
    age_2
    age_3
    age_4
    age_5
    gender_0
    gender_1
    gender_2
    marriageStatus_0
    marriageStatus_2
    marriageStatus_3
    marriageStatus_5
    marriageStatus_6
    marriageStatus_8
    marriageStatus_9
    marriageStatus_10
    marriageStatus_11
    marriageStatus_12
    marriageStatus_13
    marriageStatus_14
    marriageStatus_15
    education_0
    education_1
    education_2
    education_3
    education_4
    education_5
    education_6
    education_7
    consumptionAbility_0
    consumptionAbility_1
    consumptionAbility_2
    LBS
    interest1
    interest2
    interest5
    kw1
    kw2
    topic1
    topic2
    ct_0
    ct_1
    ct_2
    ct_3
    ct_4
    os_0
    os_1
    os_2
    carrier_0
    carrier_1
    carrier_2
    carrier_3
    house_1
    interest3_1
    interest3_2
    interest3_3
    interest3_4
    interest3_5
    interest3_6
    interest3_7
    interest3_8
    interest3_9
    interest3_10
    kw3
    topic3
    interest4_1
    interest4_2
    interest4_3
    interest4_4
    interest4_5
    interest4_6
    interest4_7
    interest4_8
    interest4_9
    interest4_10
    appIdAction
    appIdInstall
    

      

     

    ['uid', 'age', 'gender', 'marriageStatus', 'education', 'consumptionAbility', 'LBS', 'interest1', 'interest2', 'interest3', 'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdInstall', 'appIdAction', 'ct', 'os', 'carrier', 'house']
    
    ['age_0', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'gender_0', 'gender_1', 'gender_2', 'marriageStatus_0', 'marriageStatus_2', 'marriageStatus_3', 'marriageStatus_5', 'marriageStatus_6', 'marriageStatus_8', 'marriageStatus_9', 'marriageStatus_10', 'marriageStatus_11', 'marriageStatus_12', 'marriageStatus_13', 'marriageStatus_14', 'marriageStatus_15', 'education_0', 'education_1', 'education_2', 'education_3', 'education_4', 'education_5', 'education_6', 'education_7', 'consumptionAbility_0', 'consumptionAbility_1', 'consumptionAbility_2', 'LBS_0', 'interest1_0', 'interest2_0', 'interest3_1', 'interest3_2', 'interest3_3', 'interest3_4', 'interest3_5', 'interest3_6', 'interest3_7', 'interest3_8', 'interest3_9', 'interest3_10', 'interest4_1', 'interest4_2', 'interest4_3', 'interest4_4', 'interest4_5', 'interest4_6', 'interest4_7', 'interest4_8', 'interest4_9', 'interest4_10', 'interest5_0', 'kw1_0', 'kw2_0', 'kw3_0', 'topic1_0', 'topic2_0', 'topic3_0', 'appIdInstall_0', 'appIdAction_0', 'ct_0', 'ct_1', 'ct_2', 'ct_3', 'ct_4', 'os_0', 'os_1', 'os_2', 'carrier_0', 'carrier_1', 'carrier_2', 'carrier_3', 'house_1']
    

      

    f = 'userFeature.data'
    '''
    17 marriageStatus 11
    19 marriageStatus 2 13
    20 marriageStatus 13 10
    16 marriageStatus 0
    21 marriageStatus 2 13 9
    22 marriageStatus 12 13 9
    23 marriageStatus 12 13 10
    
     11 marriageStatus 11
     5 13 marriageStatus 5 13
     13 10 marriageStatus 13 10
     10 marriageStatus 10
     15 marriageStatus 15
     0 marriageStatus 0
     13 15 marriageStatus 13 15
     12 13 marriageStatus 12 13
     13 marriageStatus 13
     6 13 marriageStatus 6 13
     2 13 marriageStatus 2 13
     13 9 marriageStatus 13 9
     6 13 9 marriageStatus 6 13 9
     2 13 9 marriageStatus 2 13 9
     5 13 9 marriageStatus 5 13 9
     12 13 9 marriageStatus 12 13 9
     14 marriageStatus 14
     12 13 10 marriageStatus 12 13 10
     3 marriageStatus 3
     15 10 marriageStatus 15 10
     8 marriageStatus 8
     6 13 10 marriageStatus 6 13 10
     5 13 10 marriageStatus 5 13 10
     13 10 9 marriageStatus 13 10 9
     13 15 10 marriageStatus 13 15 10
     2 13 10 marriageStatus 2 13 10
     
     marriageStatus 0 2 5 6 8 9 10 11 12 13 15
    '''
    
    
    def fw_s(f, s):
        """Write the string ``s`` to the file at path ``f``.

        The file is opened in text write mode, so existing content is replaced.
        """
        with open(f, mode='w') as sink:
            sink.write(s)
    
    
    # d: the values found under each feature; a feature with more than one
    #    value is a candidate for being split into per-value sub-features.
    # k_oreder_l: the distinct first-level feature-name sequences, in the order
    #    they were first seen.


    res_f = 'toknowit.txt'
    res_f_k_order = res_f.replace('.', 'KeyOrder.')
    res_f_cut, children_val_max = res_f.replace('.', 'Cut.'), 50

    # Set to True to rebuild `res_f`, `res_f_cut` and `res_f_k_order` from the
    # raw file `f`; left False so the code further down reuses existing files.
    to_write = False
    if to_write:
        d, reduce_chk_counter, reduce_chk_step, k_oreder_l = {}, 0, 500000, []
        with open(f, 'r') as fr:
            for line in fr:
                k_order_l_this = []
                for group in line.replace('\n', '').split('|'):
                    tokens = group.split(' ')
                    k = tokens[0]
                    k_order_l_this.append(k)
                    if k == 'uid':
                        continue
                    # Values are converted to int so they sort numerically.
                    d.setdefault(k, []).extend(int(v) for v in tokens[1:])
                k_oreder_l.append(k_order_l_this)
                reduce_chk_counter += 1
                print(reduce_chk_counter)
                if reduce_chk_counter % reduce_chk_step == 0:
                    # Periodically collapse duplicates to limit memory use.
                    for k in d:
                        d[k] = list(set(d[k]))
                    # Lists are unhashable, so rows are de-duplicated as
                    # tuples; dict.fromkeys keeps the first-seen order.  This
                    # runs once per checkpoint rather than once per feature.
                    k_oreder_l = [list(row) for row in dict.fromkeys(map(tuple, k_oreder_l))]

        for k in d:
            d[k] = sorted(set(d[k]))

        k_oreder_l = [list(row) for row in dict.fromkeys(map(tuple, k_oreder_l))]

        # Two lines per feature: its name, then all of its values.
        s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in d[k]])) for k in d])
        fw_s(res_f, s)

        # Same layout, but at most `children_val_max` values per feature.
        s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in d[k][0:children_val_max]])) for k in d])
        fw_s(res_f_cut, s)

        # One '|'-separated feature-name sequence per line.
        s = '\n'.join(['|'.join(l) for l in k_oreder_l])
        fw_s(res_f_k_order, s)
    
    # Each line of `res_f_k_order` is one '|'-separated sequence of feature
    # names.
    with open(res_f_k_order, 'r') as fr:
        ori_feature_l = [i.replace('\n', '').split('|') for i in fr]

    # For every feature, collect the distinct features that appear after it in
    # any sequence.
    feature_after_e_d = {}
    for l in ori_feature_l:
        for e in l:
            feature_after_e_d.setdefault(e, set()).update(l[l.index(e) + 1:])

    # The more features follow a given one, the earlier it is placed.
    feature_l = sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)
    print(feature_l)

    import re

    # Merge numbered siblings such as "kw1", "kw2" into "kw" by dropping the
    # trailing digits.  The pattern must be the digit class r'\d'; a plain 'd'
    # matches the letter and would truncate names such as "gender" or "uid".
    feature_reduce_l = [re.sub(r'\d+$', '', name) for name in feature_l]
    # set() destroys the order
    print(feature_reduce_l)
    print(list(set(feature_reduce_l)))

    # Order-preserving de-duplication.
    feature_reduce_l = list(dict.fromkeys(feature_reduce_l))
    print(feature_reduce_l)
    
    sparse_num_drop_max, is_odd_line = 20, True
    # Attribute values of each feature.  `res_f_cut` is read as alternating
    # lines: a feature name, then that feature's comma-separated values.
    res_d = {}
    with open(res_f_cut, 'r') as fr:
        for line in fr:
            fields = line.replace('\n', '').split(',')
            if is_odd_line:
                k = fields[0]
                res_d[k] = []
            elif len(fields) <= sparse_num_drop_max:
                res_d[k].extend(fields)
            else:
                # More values than `sparse_num_drop_max`: the feature is not
                # expanded and gets the single placeholder value 0, which
                # yields one column named "<name>_0".
                res_d[k].append(0)
            # Name lines and value lines strictly alternate.
            is_odd_line = not is_odd_line

    # 'uid' identifies the user and is not expanded into columns.
    feature_l.remove('uid')
    feature_expand_l = []
    for k in feature_l:
        feature_expand_l += ['{}_{}'.format(k, i) for i in res_d[k]]

    print(feature_expand_l)

    dd = 5
    
    import re, time
    
    f = 'userFeature.data'
    
    '''
    17 marriageStatus 11
    19 marriageStatus 2 13
    20 marriageStatus 13 10
    16 marriageStatus 0
    21 marriageStatus 2 13 9
    22 marriageStatus 12 13 9
    23 marriageStatus 12 13 10
    
     11 marriageStatus 11
     5 13 marriageStatus 5 13
     13 10 marriageStatus 13 10
     10 marriageStatus 10
     15 marriageStatus 15
     0 marriageStatus 0
     13 15 marriageStatus 13 15
     12 13 marriageStatus 12 13
     13 marriageStatus 13
     6 13 marriageStatus 6 13
     2 13 marriageStatus 2 13
     13 9 marriageStatus 13 9
     6 13 9 marriageStatus 6 13 9
     2 13 9 marriageStatus 2 13 9
     5 13 9 marriageStatus 5 13 9
     12 13 9 marriageStatus 12 13 9
     14 marriageStatus 14
     12 13 10 marriageStatus 12 13 10
     3 marriageStatus 3
     15 10 marriageStatus 15 10
     8 marriageStatus 8
     6 13 10 marriageStatus 6 13 10
     5 13 10 marriageStatus 5 13 10
     13 10 9 marriageStatus 13 10 9
     13 15 10 marriageStatus 13 15 10
     2 13 10 marriageStatus 2 13 10
    
     marriageStatus 0 2 5 6 8 9 10 11 12 13 15
    '''
    
    
    def fw_s(f, s):
        """Write the string ``s`` to the file at path ``f``.

        The file is opened in text write mode, so existing content is replaced.
        """
        with open(f, mode='w') as sink:
            sink.write(s)
    
    
    # d: the values found under each feature; a feature with more than one
    #    value is a candidate for being split into per-value sub-features.
    # feature_order_l: the order of the first-level features.

    # Feature mining on the raw data:
    # the sub-feature values of each feature, the number of sub-features of
    # each feature, and the order of the features.
    f_feature = 'toknowit.txt'
    f_feature_ele_num = f_feature.replace('.', 'EleNum.')
    f_feature_incomplete = f_feature.replace('.', 'Incomplete.')
    f_feature_complete = f_feature.replace('.', 'Complete.')

    # Turning the raw data into production data:
    # sub-features of a first-level feature are promoted to first-level
    # features, with a cap on how many may be promoted per original feature;
    # the cap has to be chosen with the available compute and the algorithm
    # in mind.
    f_reduce = f_feature.replace('.', 'Reduce.')
    f_output = f_feature.replace('.', 'Output.')

    # <--- the files named above take no parameters; their content is constant


    # This file is the result file of the first stage and depends on
    # parameters, so a timestamp is added to its name.

    f_extend = f_feature.replace('.', 'Extend{}.'.format(int(time.time())))
    # Set to False to skip rebuilding the feature files from the raw data.
    to_write_immutable_file = True
    # to_write_immutable_file = False
    if to_write_immutable_file:
        feature_d, feature_incomplete_rows_l, reduce_chk_counter, reduce_chk_step = {}, [], 0, 300000

        # Open the GB-sized file and build two collections: the distinct
        # (possibly incomplete) feature-name sequences, and the values seen
        # under each feature.
        with open(f, 'r') as fr:
            for line in fr:
                feature_incomplete_rows_l_this = []
                for group in line.replace('\n', '').split('|'):
                    tokens = group.split(' ')
                    k = tokens[0]
                    feature_incomplete_rows_l_this.append(k)
                    if k == 'uid':
                        continue
                    # Values are converted to int so they sort numerically.
                    feature_d.setdefault(k, []).extend(int(v) for v in tokens[1:])
                feature_incomplete_rows_l.append(feature_incomplete_rows_l_this)
                reduce_chk_counter += 1
                print(reduce_chk_counter)
                if reduce_chk_counter % reduce_chk_step == 0:
                    # The counter is deliberately not reset: during testing it
                    # shows the progress and the total number of rows read.
                    for k in feature_d:
                        feature_d[k] = list(set(feature_d[k]))
                    # Lists are unhashable, so rows are de-duplicated as
                    # tuples; dict.fromkeys keeps the first-seen order.  This
                    # runs once per checkpoint rather than once per feature.
                    feature_incomplete_rows_l = [
                        list(row) for row in dict.fromkeys(map(tuple, feature_incomplete_rows_l))]
                    # subset TEST: stop after the first batch of rows
                    break

        for k in feature_d:
            feature_d[k] = sorted(set(feature_d[k]))

        feature_incomplete_rows_l = [
            list(row) for row in dict.fromkeys(map(tuple, feature_incomplete_rows_l))]

        # One comma-separated feature-name sequence per line.
        s = '\n'.join([','.join(l) for l in feature_incomplete_rows_l])
        fw_s(f_feature_incomplete, s)

        # For every feature, collect the distinct features that appear after
        # it in any sequence.
        feature_after_e_d = {}
        for l in feature_incomplete_rows_l:
            for e in l:
                feature_after_e_d.setdefault(e, set()).update(l[l.index(e) + 1:])

        # Original first-level features; the more features follow a given
        # one, the earlier it is placed.
        feature_complete_l = sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)
        print(feature_complete_l)

        s = '\n'.join(feature_complete_l)
        fw_s(f_feature_complete, s)

        print(feature_complete_l)

        # Re-key the value dict in the derived feature order, without 'uid'.
        feature_d = {feature: feature_d[feature] for feature in feature_complete_l if feature != 'uid'}

        # Two lines per feature: its name, then all of its values.
        s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k]])) for k in feature_d])
        fw_s(f_feature, s)

        # Two lines per feature: its name, then the number of its values.
        s = '\n'.join(['{}\n{}'.format(k, len(feature_d[k])) for k in feature_d])
        fw_s(f_feature_ele_num, s)

        # Raw data has been persisted <---

        # ---> process the raw data to produce new data

        # Level-0 features: merge numbered first-level features such as
        # "kw1", "kw2" by dropping the trailing digits.  The pattern must be
        # the digit class r'\d'; a plain 'd' matches the letter and would
        # truncate names such as "gender" or "uid".
        feature_reduce_l = [re.sub(r'\d+$', '', name) for name in feature_complete_l]
        # set() destroys the order
        print(feature_reduce_l)
        print(list(set(feature_reduce_l)))

        # Order-preserving de-duplication.
        feature_reduce_l = list(dict.fromkeys(feature_reduce_l))
        print(feature_reduce_l)
        s = '\n'.join(feature_reduce_l)
        fw_s(f_reduce, s)

        # Cap on the values kept per feature: the largest value count among
        # the `relative_` fraction of features with the fewest values, but
        # never more than `absolute_`.
        relative_, absolute_ = 2 / 3, 50
        value_counts = sorted(len(feature_d[k]) for k in feature_d)
        sparse_num_drop_max = min(absolute_, max(value_counts[0:int(len(feature_d) * relative_)]))

        # Two lines per feature: its name, then at most
        # `sparse_num_drop_max` of its values.
        s = '\n'.join(
            ['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k][0:sparse_num_drop_max]])) for k in feature_d])

        fw_s(f_output, s)

        # Attribute values of each feature, read back from `f_output`
        # (alternating name line / value line).
        feature_extend_d = {}
        is_odd_line = True
        with open(f_output, 'r') as fr:
            for line in fr:
                fields = line.replace('\n', '').split(',')
                if is_odd_line:
                    k = fields[0]
                    feature_extend_d[k] = []
                elif len(fields) <= sparse_num_drop_max:
                    feature_extend_d[k].extend(fields)
                else:
                    # Too many values: a single placeholder value 0.
                    feature_extend_d[k].append(0)
                is_odd_line = not is_odd_line

        feature_complete_l.remove('uid')

        # One "<feature>_<value>" column name per kept value.
        feature_extend_l = ['{}_{}'.format(k, i) for k in feature_extend_d for i in feature_extend_d[k]]

        print(feature_extend_l)
        s = ','.join(feature_extend_l)
        fw_s(f_extend, s)

    # Build a reduced data set to test a Spark join with the ad features
    # broadcast.

    # feature_extend_l, f_extend = [], 'toknowitExtend1526836898.txt'
    #
    # with open(f_extend, 'r') as fr:
    #     feature_extend_l = [i.replace('\n', '') for i in fr][0].split(',')

    d = 8
    

      

    age_0,age_1,age_2,age_3,age_4,age_5,gender_0,gender_1,gender_2,marriageStatus_0,marriageStatus_2,marriageStatus_3,marriageStatus_5,marriageStatus_6,marriageStatus_9,marriageStatus_10,marriageStatus_11,marriageStatus_12,marriageStatus_13,marriageStatus_14,marriageStatus_15,education_0,education_1,education_2,education_3,education_4,education_5,education_6,education_7,consumptionAbility_0,consumptionAbility_1,consumptionAbility_2,LBS_0,LBS_1,LBS_4,LBS_6,LBS_7,LBS_8,LBS_9,LBS_13,LBS_14,LBS_15,LBS_16,LBS_18,LBS_19,LBS_21,LBS_25,LBS_27,LBS_29,LBS_32,LBS_33,LBS_35,LBS_38,LBS_39,LBS_41,LBS_43,LBS_45,LBS_46,LBS_47,LBS_48,LBS_49,LBS_52,LBS_54,LBS_56,LBS_57,LBS_61,LBS_62,LBS_63,LBS_64,LBS_66,LBS_69,LBS_71,LBS_72,LBS_73,LBS_75,LBS_77,LBS_78,LBS_81,LBS_83,LBS_84,LBS_85,LBS_86,interest1_1,interest1_2,interest1_3,interest1_4,interest1_5,interest1_6,interest1_7,interest1_8,interest1_9,interest1_10,interest1_11,interest1_12,interest1_13,interest1_14,interest1_15,interest1_16,interest1_17,interest1_18,interest1_19,interest1_20,interest1_21,interest1_22,interest1_23,interest1_24,interest1_25,interest1_26,interest1_27,interest1_28,interest1_29,interest1_30,interest1_31,interest1_32,interest1_33,interest1_34,interest1_35,interest1_36,interest1_37,interest1_38,interest1_39,interest1_40,interest1_41,interest1_42,interest1_43,interest1_44,interest1_45,interest1_46,interest1_47,interest1_48,interest1_49,interest1_50,interest2_1,interest2_2,interest2_3,interest2_4,interest2_5,interest2_6,interest2_7,interest2_8,interest2_9,interest2_10,interest2_11,interest2_12,interest2_13,interest2_14,interest2_15,interest2_16,interest2_17,interest2_18,interest2_19,interest2_20,interest2_21,interest2_22,interest2_23,interest2_24,interest2_25,interest2_26,interest2_27,interest2_28,interest2_29,interest2_30,interest2_31,interest2_32,interest2_33,interest2_35,interest2_36,interest2_37,interest2_38,interest2_39,interest2_40,interest2_41,interest2_42,interest2_43,interest2_44,interest2_45,interest2_46,inte
rest2_47,interest2_48,interest2_49,interest2_50,interest2_51,interest3_1,interest3_2,interest3_3,interest3_4,interest3_5,interest3_6,interest3_7,interest3_8,interest3_9,interest3_10,interest4_1,interest4_2,interest4_3,interest4_4,interest4_5,interest4_6,interest4_7,interest4_8,interest4_9,interest4_10,interest5_1,interest5_2,interest5_3,interest5_4,interest5_5,interest5_6,interest5_7,interest5_8,interest5_9,interest5_10,interest5_11,interest5_12,interest5_13,interest5_14,interest5_15,interest5_16,interest5_17,interest5_18,interest5_19,interest5_20,interest5_21,interest5_22,interest5_23,interest5_24,interest5_25,interest5_26,interest5_27,interest5_28,interest5_29,interest5_30,interest5_31,interest5_32,interest5_33,interest5_34,interest5_35,interest5_36,interest5_37,interest5_38,interest5_39,interest5_40,interest5_41,interest5_42,interest5_43,interest5_44,interest5_45,interest5_46,interest5_47,interest5_48,interest5_49,interest5_50,kw1_13,kw1_19,kw1_28,kw1_69,kw1_70,kw1_72,kw1_87,kw1_92,kw1_105,kw1_106,kw1_109,kw1_119,kw1_121,kw1_123,kw1_133,kw1_136,kw1_145,kw1_152,kw1_157,kw1_163,kw1_169,kw1_176,kw1_177,kw1_180,kw1_181,kw1_191,kw1_209,kw1_235,kw1_242,kw1_249,kw1_278,kw1_279,kw1_289,kw1_295,kw1_313,kw1_317,kw1_321,kw1_336,kw1_341,kw1_344,kw1_354,kw1_358,kw1_366,kw1_367,kw1_370,kw1_376,kw1_378,kw1_380,kw1_382,kw1_391,kw2_2,kw2_10,kw2_11,kw2_34,kw2_46,kw2_47,kw2_50,kw2_55,kw2_62,kw2_63,kw2_69,kw2_70,kw2_76,kw2_87,kw2_91,kw2_100,kw2_114,kw2_116,kw2_117,kw2_123,kw2_124,kw2_127,kw2_129,kw2_135,kw2_137,kw2_142,kw2_144,kw2_151,kw2_158,kw2_163,kw2_168,kw2_174,kw2_177,kw2_180,kw2_184,kw2_192,kw2_196,kw2_197,kw2_200,kw2_202,kw2_215,kw2_216,kw2_217,kw2_223,kw2_235,kw2_237,kw2_240,kw2_241,kw2_246,kw2_250,kw3_7,kw3_27,kw3_29,kw3_68,kw3_80,kw3_88,kw3_95,kw3_101,kw3_138,kw3_171,kw3_186,kw3_197,kw3_198,kw3_206,kw3_213,kw3_223,kw3_248,kw3_263,kw3_273,kw3_302,kw3_316,kw3_336,kw3_349,kw3_362,kw3_381,kw3_401,kw3_412,kw3_416,kw3_453,kw3_465,kw3_470,kw3_488,kw3_513,kw3_534,kw3_549,kw3_560,
kw3_570,kw3_581,kw3_586,kw3_598,kw3_610,kw3_627,kw3_633,kw3_638,kw3_668,kw3_685,kw3_692,kw3_694,kw3_695,kw3_701,topic1_0,topic1_1,topic1_2,topic1_3,topic1_4,topic1_5,topic1_6,topic1_7,topic1_9,topic1_10,topic1_11,topic1_12,topic1_13,topic1_14,topic1_15,topic1_16,topic1_17,topic1_18,topic1_19,topic1_20,topic1_21,topic1_22,topic1_23,topic1_24,topic1_25,topic1_26,topic1_27,topic1_28,topic1_29,topic1_30,topic1_31,topic1_32,topic1_33,topic1_34,topic1_35,topic1_36,topic1_37,topic1_38,topic1_39,topic1_40,topic1_41,topic1_42,topic1_43,topic1_44,topic1_45,topic1_46,topic1_47,topic1_48,topic1_49,topic1_50,topic2_0,topic2_2,topic2_3,topic2_4,topic2_5,topic2_6,topic2_7,topic2_9,topic2_10,topic2_11,topic2_13,topic2_14,topic2_15,topic2_16,topic2_17,topic2_19,topic2_20,topic2_21,topic2_22,topic2_24,topic2_25,topic2_26,topic2_27,topic2_28,topic2_29,topic2_30,topic2_31,topic2_32,topic2_33,topic2_34,topic2_35,topic2_36,topic2_39,topic2_40,topic2_41,topic2_42,topic2_43,topic2_44,topic2_45,topic2_46,topic2_47,topic2_48,topic2_49,topic2_50,topic2_51,topic2_52,topic2_53,topic2_54,topic2_55,topic2_56,topic3_3,topic3_10,topic3_11,topic3_14,topic3_18,topic3_24,topic3_28,topic3_30,topic3_31,topic3_33,topic3_39,topic3_42,topic3_43,topic3_47,topic3_53,topic3_55,topic3_56,topic3_58,topic3_59,topic3_60,topic3_62,topic3_66,topic3_68,topic3_70,topic3_72,topic3_76,topic3_78,topic3_79,topic3_81,topic3_84,topic3_87,topic3_90,topic3_92,topic3_99,topic3_100,topic3_101,topic3_109,topic3_111,topic3_112,topic3_119,topic3_121,topic3_123,topic3_124,topic3_127,topic3_130,topic3_136,topic3_137,topic3_138,topic3_139,topic3_141,appIdInstall_1,appIdInstall_4,appIdInstall_6,appIdInstall_9,appIdInstall_10,appIdInstall_11,appIdInstall_12,appIdInstall_15,appIdInstall_16,appIdInstall_17,appIdInstall_19,appIdInstall_21,appIdInstall_23,appIdInstall_26,appIdInstall_27,appIdInstall_28,appIdInstall_29,appIdInstall_32,appIdInstall_34,appIdInstall_35,appIdInstall_39,appIdInstall_40,appIdInstall_41,appIdInstall_42,appIdInsta
ll_43,appIdInstall_44,appIdInstall_45,appIdInstall_47,appIdInstall_48,appIdInstall_49,appIdInstall_51,appIdInstall_52,appIdInstall_55,appIdInstall_56,appIdInstall_57,appIdInstall_58,appIdInstall_60,appIdInstall_61,appIdInstall_62,appIdInstall_63,appIdInstall_65,appIdInstall_67,appIdInstall_68,appIdInstall_69,appIdInstall_70,appIdInstall_71,appIdInstall_73,appIdInstall_74,appIdInstall_76,appIdInstall_77,appIdAction_2,appIdAction_4,appIdAction_5,appIdAction_7,appIdAction_8,appIdAction_11,appIdAction_13,appIdAction_14,appIdAction_16,appIdAction_17,appIdAction_27,appIdAction_30,appIdAction_32,appIdAction_33,appIdAction_34,appIdAction_35,appIdAction_36,appIdAction_37,appIdAction_38,appIdAction_39,appIdAction_40,appIdAction_41,appIdAction_43,appIdAction_44,appIdAction_45,appIdAction_47,appIdAction_50,appIdAction_51,appIdAction_52,appIdAction_53,appIdAction_55,appIdAction_56,appIdAction_60,appIdAction_62,appIdAction_65,appIdAction_66,appIdAction_69,appIdAction_70,appIdAction_71,appIdAction_72,appIdAction_74,appIdAction_75,appIdAction_76,appIdAction_77,appIdAction_80,appIdAction_81,appIdAction_83,appIdAction_84,appIdAction_85,appIdAction_91,ct_0,ct_1,ct_2,ct_3,ct_4,os_0,os_1,os_2,carrier_0,carrier_1,carrier_2,carrier_3,house_1
    

      

    JOIN 操作转移至Spark

    import re, time
    
    # Path of the raw user-feature file; each line is "uid ...|group v1 v2 ...|...".
    f = 'userFeature.data'
    
    # The bare string below is never assigned or used; it serves as scratch notes.
    # NOTE(review): it appears to list marriageStatus value combinations seen in
    # the data, with the last line giving the distinct values - confirm with the
    # author; the meaning of the leading numbers in the first group is not stated.
    '''
    17 marriageStatus 11
    19 marriageStatus 2 13
    20 marriageStatus 13 10
    16 marriageStatus 0
    21 marriageStatus 2 13 9
    22 marriageStatus 12 13 9
    23 marriageStatus 12 13 10
    
     11 marriageStatus 11
     5 13 marriageStatus 5 13
     13 10 marriageStatus 13 10
     10 marriageStatus 10
     15 marriageStatus 15
     0 marriageStatus 0
     13 15 marriageStatus 13 15
     12 13 marriageStatus 12 13
     13 marriageStatus 13
     6 13 marriageStatus 6 13
     2 13 marriageStatus 2 13
     13 9 marriageStatus 13 9
     6 13 9 marriageStatus 6 13 9
     2 13 9 marriageStatus 2 13 9
     5 13 9 marriageStatus 5 13 9
     12 13 9 marriageStatus 12 13 9
     14 marriageStatus 14
     12 13 10 marriageStatus 12 13 10
     3 marriageStatus 3
     15 10 marriageStatus 15 10
     8 marriageStatus 8
     6 13 10 marriageStatus 6 13 10
     5 13 10 marriageStatus 5 13 10
     13 10 9 marriageStatus 13 10 9
     13 15 10 marriageStatus 13 15 10
     2 13 10 marriageStatus 2 13 10
    
     marriageStatus 0 2 5 6 8 9 10 11 12 13 15
    '''
    
    
    def fw_s(f, s):
        """Write the string ``s`` to the file at path ``f``, replacing any old content."""
        with open(f, mode='w') as out_file:
            out_file.write(s)
    
    
    # d: collects which values occur under each feature; when a feature has more
    # than one value, consider splitting it into one sub-feature per value.
    # feature_order_l: records the order of the first-level features.
    
    # Raw data: feature mining.
    # Sub-feature values of each feature, number of sub-features per feature,
    # and the order of the features.
    f_feature = 'toknowit.txt'
    # -> 'toknowitEleNum.txt'
    f_feature_ele_num = f_feature.replace('.', 'EleNum.')
    # -> 'toknowitIncomplete.txt', 'toknowitComplete.txt'
    f_feature_incomplete, f_feature_complete = f_feature.replace('.', 'Incomplete.'), f_feature.replace('.',
                                                                                                        'Complete.')
    
    # Raw data processed into production data.
    # Promote the sub-features of a first-level feature to first-level features,
    # but cap how many sub-features of each original feature may be promoted;
    # the cap has to be chosen with compute budget and algorithm in mind.
    # -> 'toknowitReduce.txt', 'toknowitOutput.txt'
    f_reduce, f_output = f_feature.replace('.', 'Reduce.'), f_feature.replace('.', 'Output.')
    
    # <--- the files generated above take no parameters; their data is constant
    
    
    # This file is the result of the first stage and depends on parameters,
    # so a timestamp is added to its name.
    
    f_extend = f_feature.replace('.', 'Extend{}.'.format(int(time.time())))
    # Switch for running the stage-1 scan that (re)writes the files named above.
    to_write_immutable_file = True
    # to_write_immutable_file = False
    # to_write_immutable_file = False
    if to_write_immutable_file:
        # Stage 1: scan the raw user-feature file once and persist feature metadata.
        #
        # feature_d                  feature name -> distinct integer values seen
        # feature_incomplete_rows_l  distinct per-row feature-name sequences, in
        #                            first-seen order (a row may lack some features)
        # reduce_chk_counter         number of rows read so far (progress output)
        feature_d, feature_incomplete_rows_l, reduce_chk_counter = {}, [], 0
        seen_feature_rows = set()
    
        # Open the GB-sized file and collect two things: the set of (possibly
        # incomplete) feature sequences and the feature -> values mapping.
        # Sets de-duplicate while streaming, so no periodic compaction pass over
        # the accumulated lists is needed.
        with open(f, 'r') as fr:
            for i in fr:
                feature_incomplete_rows_l_this = []
                for ii in i.replace('\n', '').split('|'):
                    ll = ii.split(' ')
                    k = ll[0]
                    feature_incomplete_rows_l_this.append(k)
                    if k == 'uid':
                        continue
                    # Values are stored as int so that they sort numerically.
                    feature_d.setdefault(k, set()).update(int(iii) for iii in ll[1:])
                row_key = tuple(feature_incomplete_rows_l_this)
                if row_key not in seen_feature_rows:
                    seen_feature_rows.add(row_key)
                    feature_incomplete_rows_l.append(feature_incomplete_rows_l_this)
                reduce_chk_counter += 1
                print(reduce_chk_counter)
    
        for k in feature_d:
            feature_d[k] = sorted(feature_d[k])
    
        s = '\n'.join([','.join(l) for l in feature_incomplete_rows_l])
        fw_s(f_feature_incomplete, s)
    
        # For every feature name, gather the names that ever appear after it in a
        # row. A feature followed by more distinct names sorts earlier, which
        # reconstructs the complete column order from incomplete rows.
        feature_after_e_d = {}
        for l in feature_incomplete_rows_l:
            for pos, e in enumerate(l):
                feature_after_e_d.setdefault(e, set()).update(l[pos + 1:])
    
        # Original first-level features, in file order.
        feature_complete_l = sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)
        print(feature_complete_l)
    
        s = '\n'.join(feature_complete_l)
        fw_s(f_feature_complete, s)
    
        print(feature_complete_l)
    
        # Re-key feature_d in the reconstructed column order, without 'uid'.
        feature_d = {feature: feature_d[feature] for feature in feature_complete_l if feature != 'uid'}
    
        # Two lines per feature: the name, then its comma-separated values.
        s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k]])) for k in feature_d])
        fw_s(f_feature, s)
    
        # Two lines per feature: the name, then the number of distinct values.
        s = '\n'.join(['{}\n{}'.format(k, len(feature_d[k])) for k in feature_d])
        fw_s(f_feature_ele_num, s)
    
        # Raw data has been persisted <---
    
        # ---> process the raw data into new data
    
        # Level-0 features: merge numbered first-level features such as
        # "kw1", "kw2" into "kw" by dropping the last character of any name that
        # contains a digit.
        feature_reduce_l = [i[:-1] if re.search(r'\d', i) else i for i in feature_complete_l]
        # A set would destroy the order (printed for comparison).
        print(feature_reduce_l)
        print(list(set(feature_reduce_l)))
    
        # Order-preserving de-duplication.
        feature_reduce_l = [e for i, e in enumerate(feature_reduce_l) if feature_reduce_l.index(e) == i]
        print(feature_reduce_l)
        s = '\n'.join(feature_reduce_l)
        fw_s(f_reduce, s)
    
        # Cap on how many values of one feature become columns: the largest value
        # count among the `relative_` share of features with the fewest values,
        # but never more than `absolute_`.
        relative_, absolute_ = 2 / 3, 50
        value_counts = sorted(len(feature_d[k]) for k in feature_d)
        sparse_num_drop_max = min(absolute_, max(value_counts[0:int(len(feature_d) * relative_)]))
    
        s = '\n'.join(
            ['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k][0:sparse_num_drop_max]])) for k in feature_d])
    
        fw_s(f_output, s)
    
        # Attribute values of each feature, read back from the two-line-per-feature
        # output file (odd lines: feature name, even lines: its values).
        feature_extend_d = {}
        is_odd_line = True
        with open(f_output, 'r') as fr:
            for i in fr:
                l = i.replace('\n', '').split(',')
                if is_odd_line:
                    k = l[0]
                    feature_extend_d[k] = []
                elif len(l) <= sparse_num_drop_max:
                    feature_extend_d[k].extend(l)
                else:
                    feature_extend_d[k].append(0)
                is_odd_line = not is_odd_line
    
        feature_complete_l.remove('uid')
    
        # Flatten to the expanded column names "<feature>_<value>".
        feature_extend_l = '|'.join(
            ['|'.join(['{}_{}'.format(k, str(i)) for i in feature_extend_d[k]]) for k in feature_extend_d]).split('|')
    
        print(feature_extend_l)
        s = ','.join(feature_extend_l)
        fw_s(f_extend, s)
    
    # Build a reduced data set for testing the Spark join (the ad features are
    # broadcast).
    #
    # ori_l     original first-level feature names, one per line of the file
    # extend_l  expanded column names "<feature>_<value>" from the first line of
    #           the stage-1 extend file
    ori_l, extend_l = [], []
    with open('toknowitComplete.txt', 'r') as fr:
        ori_l = [i.replace('\n', '') for i in fr]
    
    feature_extend_l, f_feature_extend = [], f_extend
    with open(f_feature_extend, 'r') as fr:
        extend_l = [i.replace('\n', '').split(',') for i in fr][0]
    
    # ori_extend_d: feature -> {expanded column name: 0}, the all-zero template
    # that is copied and filled in for every user row.
    # A column belongs to the feature whose name precedes its last underscore.
    # A plain substring test is wrong here: 'age' occurs inside
    # 'marriageStatus_*' and 'ct' inside 'appIdAction_*', which would add
    # foreign columns to those features.
    ori_extend_d = {}
    
    for ori in ori_l:
        for extend_ in extend_l:
            if extend_.rsplit('_', 1)[0] == ori:
                ori_extend_d.setdefault(ori, {})[extend_] = 0
    
    # Imported here because the per-row expansion below deep-copies the template.
    import copy
    
    # 1 - build the per-row template structure with every leaf value set to 0
    #     (ori_extend_d);
    # 2 - take each row of data and update the leaf values.
    #
    # rows_d_l collects one {uid: filled template} dict per user row.
    # c_ is a row counter; it is incremented when the rows are serialised.
    c_ = 0
    rows_d_l = []
    with open(f, 'r') as fr:
        for i in fr:
            ori_row_l = i.replace('\n', '').split('|')
            # Fresh all-zero copy for this row; the shared template stays untouched.
            ori_extend_d_this = copy.deepcopy(ori_extend_d)
            for ii in ori_row_l:
                l = ii.split(' ')
                print(l)
                feature_ori, val_l = l[0], l[1:]
                if feature_ori == 'uid':
                    uid = val_l[0]
                    continue
                if len(ori_extend_d[feature_ori]) == 1:
                    # Feature with a single expanded column: flag it when the raw
                    # value is positive.
                    for feature_sub in ori_extend_d_this[feature_ori]:
                        print(feature_sub)
                        ori_extend_d_this[feature_ori][feature_sub] = 1 if int(val_l[0]) > 0 else 0
                else:
                    # Multi-valued feature: one-hot each value that has a column.
                    for val_ in val_l:
                        feature_sub = '{}_{}'.format(feature_ori, val_)
                        print(feature_sub)
                        # Values dropped by the column cap have no column and are
                        # skipped.
                        if feature_sub in ori_extend_d_this[feature_ori]:
                            ori_extend_d_this[feature_ori][feature_sub] = 1
            rows_d_l.append({uid: ori_extend_d_this})
    
    # Serialise the expanded rows as "uid,v1,v2,..." lines into a timestamped copy
    # of the user-feature file name.
    s_l = []
    f_userdata_extend = f.replace('.data', '{}.data'.format(int(time.time())))
    for d in rows_d_l:
        for uid in d:
            c_ += 1
            d_ = d[uid]
            # Flatten feature -> column -> flag in dict iteration order.
            l = [d_[feature_][feature_sub] for feature_ in d_ for feature_sub in d_[feature_]]
            s = '{},{}'.format(uid, ','.join([str(i) for i in l]))
            s_l.append(s)
    fw_s(f_userdata_extend, '\n'.join(s_l))
    # Number of rows written.
    print(c_)
    
    # Generate the JOIN data for the DNN.
    f_user = 'userFeature.data'
    f_ad = 'adFeature.csv'
    f_user_extend = f_userdata_extend
    f_train = 'train.csv'
    f_test = 'test2.csv'
    
    # Build the CSV header: "label", the expanded user columns, then the ad columns.
    csv_head = 'advertiserId,campaignId,creativeId,creativeSize,adCategoryId,productId,productType'
    # NOTE(review): this is a hard-coded, timestamped stage-1 output; confirm it
    # is the extend file that matches the current run.
    f_toknowitExtend = 'toknowitExtend1527038949.txt'
    try:
        with open(f_toknowitExtend, 'r') as fr:
            for i in fr:
                csv_head = 'label,{},{}'.format(i.replace('\n', ''), csv_head)
        print(csv_head)
    except (OSError, UnicodeError) as e:
        # The header file is missing or unreadable: continue without a header.
        print(e)
        csv_head = ''  # no  file
    # Build the lookup tables used for the join.
    
    # ad_d: aid -> remaining ad columns, kept as one comma-joined string.
    ad_d = {}
    with open(f_ad, 'r') as fr:
        for i in fr:
            # Skip the header line.
            if 'aid' in i:
                continue
            l = i.replace('\n', '').split(',')
            aid = l[0]
            ad_d[aid] = ','.join(l[1:])
    
    # uid_d: uid -> expanded user columns, kept as one comma-joined string.
    uid_d = {}
    with open(f_user_extend, 'r') as fr:
        for i in fr:
            # Same header guard as for the ad file; the extended user file written
            # by this script has no header, so this normally never triggers.
            if 'aid' in i:
                continue
            l = i.replace('\n', '').split(',')
            uid = l[0]
            uid_d[uid] = ','.join(l[1:])
    # Generate the training data: one "label,<user columns>,<ad columns>" line per
    # row of the train file, under the CSV header.
    dnn_csvTRAIN = 'dnn_csvTRAIN{}.csv'.format(int(time.time()))
    # The output is opened once instead of being reopened in append mode per row.
    with open(dnn_csvTRAIN, 'w') as fa:
        if csv_head:
            # Terminate the header line so the first data row does not fuse with it.
            fa.write(csv_head + '\n')
        with open(f_train, 'r') as fr:
            for i in fr:
                # Skip the header line.
                if 'aid' in i:
                    continue
                l = i.replace('\n', '').replace(' ', '').split(',')
                print(l)
                try:
                    aid, uid, label = l
                    s = '{},{},{}\n'.format(label, uid_d[uid], ad_d[aid])
                except (KeyError, ValueError) as e:
                    # Malformed row, or an aid/uid without features: report and skip.
                    print(e)
                    continue
                fa.write(s)
    
    # Generate the test data.
    
    dnn_csvTEST = 'dnn_csvTEST{}.csv'.format(int(time.time()))
    # NOTE(review): the original loop hit an unconditional `break` on the first
    # data row, so only the header was ever written. That behaviour is kept and
    # made explicit by this flag; set it to True to actually emit test rows.
    to_write_test_rows = False
    with open(dnn_csvTEST, 'w') as fa:
        if csv_head:
            # Terminate the header line so the first data row does not fuse with it.
            fa.write(csv_head + '\n')
        if to_write_test_rows:
            with open(f_test, 'r') as fr:
                for i in fr:
                    # Skip the header line.
                    if 'aid' in i:
                        continue
                    l = i.replace('\n', '').replace(' ', '').split(',')
                    print(l)
                    try:
                        # Test rows carry no label; 0 is written as a placeholder.
                        aid, uid = l
                        label = 0
                        s = '{},{},{}\n'.format(label, uid_d[uid], ad_d[aid])
                    except (KeyError, ValueError) as e:
                        # Malformed row, or an aid/uid without features: report and skip.
                        print(e)
                        continue
                    fa.write(s)
    
    # NOTE(review): no-op assignment, presumably a debugger breakpoint anchor.
    dd = 9
  • 相关阅读:
    第19章_枚举:
    第14章_类型信息:
    第13章_字符串:
    纯css背景图自适应
    事务隔离机制_悲观锁_乐观锁
    hibernate 缓存
    list 和 iterate
    hibernate 性能优化之 1+N 问题
    QBC(Query By Criteria) QBE (Query By Example)
    HQL(Hibernate Query Language)
  • 原文地址:https://www.cnblogs.com/rsapaper/p/9058141.html
Copyright © 2011-2022 走看看