zoukankan      html  css  js  c++  java
  • 字段分隔,每个字段一列

    #将原文件存入字典
    import os
    import copy
    import codecs 
    # Work inside the directory that holds the input and output files.
    os.chdir('/Users/zhangb/Desktop/数据挖掘文件/取数流程')
    # Load the source file into a dict. Each line is '|'-delimited: the first
    # nine fields, re-joined with '|', form the key; the last field is split
    # on ',' and stored as the value (a list of strings).
    source_dic = {}
    # The context manager closes the file even if parsing raises.
    with codecs.open('yalu1115', 'r', 'utf-8') as f_in:
        for raw in f_in:
            raw = raw.strip()
            if not raw:
                # A blank line would otherwise create a bogus entry keyed by ''.
                continue
            line = raw.split('|')
            key = '|'.join(line[0:9])
            value = line[-1].split(',')
            source_dic[key] = value
    # Load the dimension table into a dict that maps the first '|'-delimited
    # field of each line to [second field, third field],
    # e.g. {'011100': ['年龄段', '0-17岁']}.
    dim_dic = {}
    # The context manager closes the file even if parsing raises.
    with codecs.open('dim_tags.txt', 'r', 'utf-8') as f_in:
        for raw in f_in:
            raw = raw.strip()
            if not raw:
                # Blank lines have no fields; indexing them would raise IndexError.
                continue
            line = raw.split('|')
            # Lines with fewer than three fields still raise IndexError here,
            # so a malformed dimension file is not silently accepted.
            dim_dic[line[0]] = [line[1], line[2]]
    #print(dim_dic.keys())
    # Keys of the dimension table, in dict iteration order; they define the
    # column order of the intermediate layer.
    ind = list(dim_dic)
    # Build the intermediate layer: every row gets one slot per id in ind.
    # A slot holds dim_dic[id][1] when the row's source value list contains
    # that id, and '' otherwise (unmatched ids are padded, not dropped).
    middle_dic = {
        row_key: [dim_dic[tag_id][1] if tag_id in tag_ids else '' for tag_id in ind]
        for row_key, tag_ids in source_dic.items()
    }
    #print(middle_dic)
    # Re-organise the intermediate layer by the header file so that every row
    # has one slot per header entry, in header order. Columns stay aligned
    # across rows: a given column holds either that column's name or ''.
    # The context manager closes the header file even if reading raises.
    with codecs.open('dim_tags_name.txt', 'r', 'utf-8') as ff:
        sorted_list = [name.strip() for name in ff]
    sort_dic = {}
    for k, v in middle_dic.items():
        # Hash lookups instead of rescanning the row's list once per column.
        present = set(v)
        sort_dic[k] = [name if name in present else '' for name in sorted_list]
    #print(sort_dic)
    # A 0/1 matrix (convenient for computation) could be produced instead by
    # storing 1 in place of the name in the aligned rows.
    # The rows are aligned, so the leading columns can now be grouped: each of
    # the first eight categories is collapsed into a single column.
    # Half-open [start, stop) column ranges, listed in output order.
    category_ranges = [
        (0, 6),    # age
        (6, 8),    # gender (precise)
        (8, 10),   # gender
        (10, 18),  # has children
        (18, 21),  # consumption level
        (21, 24),  # marital status
        (24, 32),  # occupation status
        (32, 35),  # sexual orientation
    ]
    # Only existing keys are reassigned, so the dict does not change size
    # while it is being iterated.
    for k, v in sort_dic.items():
        merged = []
        for start, stop in category_ranges:
            picked = ''
            # The last non-empty value in the range wins. Indexing (rather
            # than slicing) keeps the IndexError for rows shorter than 35
            # columns, which signals a misaligned header file.
            for i in range(start, stop):
                if v[i]:
                    picked = v[i]
            merged.append(picked)
        # Columns from index 35 onwards are carried over unchanged.
        sort_dic[k] = merged + v[35:]
    # Write one '|'-delimited line per row: the key followed by its columns.
    # The context manager flushes and closes the file even if a write raises.
    with codecs.open('done_yalu1115', 'w', 'utf-8') as ftags:
        for k, v in sort_dic.items():
            # The terminator must be the escape sequence '\n'; a raw line break
            # inside a single-quoted literal is a syntax error.
            ftags.write(k + '|' + '|'.join(v) + '\n')
  • 相关阅读:
    Lintcode423-Valid Parentheses-Easy
    Lintcode97-Maximum Depth of Binary Tree-Easy
    Lintcode175-Revert Binary Tree-Easy
    Lintcode469-Same Tree-Easy
    Leetcode480-Binary Tree Paths-Easy
    Lintcode481-Binary Tree Leaf Sum-Easy
    Lintcode482-Binary Tree Level Sum-Easy
    Lintcode376-Binary Tree Path Sum-Easy
    SQL
    Database
  • 原文地址:https://www.cnblogs.com/zhangbojiangfeng/p/6077528.html
Copyright © 2011-2022 走看看