#将原文件存入字典
import os
import copy
import codecs
# Work relative to the directory that holds the extraction input files.
os.chdir('/Users/zhangb/Desktop/数据挖掘文件/取数流程')

# Load the source extract into a dict.
# Each line is '|'-delimited: up to the first nine fields, re-joined with '|',
# form the record key; the last field is a comma-separated list of tag ids.
# A later line with the same key overwrites the earlier one.
source_dic = {}
with codecs.open('yalu1115', 'r', 'utf-8') as f_in:
    for raw_line in f_in:
        stripped = raw_line.strip()
        if not stripped:
            # Skip blank lines so they do not create a bogus '' record.
            continue
        fields = stripped.split('|')
        record_key = '|'.join(fields[0:9])
        source_dic[record_key] = fields[-1].split(',')
# Load the tag dimension table into a dict keyed by the first field (tag id),
# mapping to [second field, third field],
# e.g. {'011100': ['年龄段', '0-17岁']}  (tag category, tag name).
dim_dic = {}
with codecs.open('dim_tags.txt', 'r', 'utf-8') as f_in:
    for raw_line in f_in:
        stripped = raw_line.strip()
        if not stripped:
            # A blank line has no fields to index; skip it instead of crashing.
            continue
        fields = stripped.split('|')
        dim_dic[fields[0]] = [fields[1], fields[2]]
# Index used to build the middle layer: every tag id in the dimension table.
ind = list(dim_dic)

# Build the middle layer: for each record, walk the dimension-table tag ids
# and emit the tag name when the record carries that id, otherwise ''.
# Tag ids on the record that are not in the dimension table are dropped.
middle_dic = {}
for k, tag_ids in source_dic.items():
    # One set per record keeps each membership test O(1).
    present_ids = set(tag_ids)
    middle_dic[k] = [
        dim_dic[tag_id][1] if tag_id in present_ids else ''
        for tag_id in ind
    ]
# Arrange the middle layer by the header file so that every record has one
# column per tag name, in header order. A column holds either that tag name
# or '' when the record lacks the tag, so all records stay aligned
# (e.g. the 'male' column can only contain 'male' or the empty string).
with codecs.open('dim_tags_name.txt', 'r', 'utf-8') as ff:
    sorted_list = [header_name.strip() for header_name in ff]

sort_dic = {}
for k, tag_names in middle_dic.items():
    # One set per record keeps each membership test O(1).
    present_names = set(tag_names)
    sort_dic[k] = [
        header_name if header_name in present_names else ''
        for header_name in sorted_list
    ]
# NOTE: emitting 1 instead of the tag name above would give a 0/1 indicator
# matrix, which is more convenient for numeric work.
# The rows are now aligned; what remains is grouping columns by attribute.
# The first 35 aligned columns are collapsed into 8 single columns, one per
# attribute. Each entry is (attribute, start, stop) with a half-open column
# range [start, stop) in header order.
_ATTRIBUTE_SPANS = (
    ('age', 0, 6),
    ('gender_true', 6, 8),            # precise gender
    ('gender', 8, 10),
    ('parent', 10, 18),               # has children
    ('consumption', 18, 21),          # consumption level
    ('marital_status', 21, 24),
    ('occupation', 24, 32),           # occupation status
    ('sexual_orientation', 32, 35),
)
# Number of leading columns consumed by the spans above.
_COLLAPSED_WIDTH = 35


def _last_non_empty(cells):
    """Return the last non-empty string in *cells*, or '' if all are empty."""
    picked = ''
    for cell in cells:
        if cell:
            picked = cell
    return picked


for k, row in sort_dic.items():
    if len(row) < _COLLAPSED_WIDTH:
        # Slicing would silently misalign a short row, so fail loudly instead.
        raise IndexError(
            'record %r has %d columns; at least %d are required'
            % (k, len(row), _COLLAPSED_WIDTH)
        )
    collapsed = [
        _last_non_empty(row[start:stop])
        for _attribute, start, stop in _ATTRIBUTE_SPANS
    ]
    # Replacing the value of an existing key is safe while iterating items().
    sort_dic[k] = collapsed + row[_COLLAPSED_WIDTH:]
# Write one '|'-delimited line per record: the record key followed by its
# columns. The line terminator is an explicit '\n' escape.
with codecs.open('done_yalu1115', 'w', 'utf-8') as ftags:
    for k, v in sort_dic.items():
        ftags.write(k + '|' + '|'.join(v) + '\n')