1. Parsing the raw user-info JSON file
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys
import json

def main():
    root_dir = sys.argv[1]

    # load the province list used to normalize user locations
    province_file = root_dir + "/conf/province.list"
    fin = open(province_file, 'r')
    provinces = set()
    for line in fin:
        province = line.strip().decode('utf-8')  # json.loads returns unicode, so compare in unicode
        provinces.add(province)
    fin.close()

    input_file = root_dir + "/source_data/userinfo.json"
    output_file = root_dir + "/result_data/userinfo.data"
    fin = open(input_file, 'r')
    fout = open(output_file, 'w')
    for line in fin:
        if line.strip() == "[]":
            continue
        json_file = json.loads(line.strip())
        userid = json_file['userId']
        sex = json_file['sex']
        location = json_file['location']
        birthday = json_file['birthday']
        attentioncount = json_file['attentionCount']
        fanscount = json_file['fansCount']
        weibocount = json_file['weiboCount']
        label_list = json_file['labelList']
        user_introduce = json_file['userIntroduce']

        if not sex:
            sex = 'null'
        # keep only the province part of the location
        if location and location.find(' ') != -1:
            location = location.split(' ')[0]
        elif location:
            for province in provinces:
                if location.find(province) != -1:
                    location = province
        if not location:
            location = 'null'
        # keep only the year of birth
        if not birthday:
            birthday = 'null'
        else:
            index = birthday.find(u'年')
            if index != -1:
                birthday = birthday[0:index]
            else:
                birthday = 'null'
        if not attentioncount:
            attentioncount = '0'
        if not fanscount:
            fanscount = '0'
        if not weibocount:
            weibocount = '0'
        if not label_list or not label_list.strip():
            label_list = 'null'
        if not user_introduce or not user_introduce.strip():
            user_introduce = 'null'
        record = u"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (userid, sex, location,
            birthday, attentioncount, fanscount, weibocount, label_list, user_introduce)
        print>>fout, record.encode('utf-8')
    fin.close()
    fout.close()

if __name__ == "__main__":
    main()
1) The user labels still need to be segmented into words (a segmentation sketch follows the shell commands below).
2) Judging from this data, roughly a third of all users carry labels.
3) Users without labels are marked with null.
To drop the records whose label is null, equivalent shell one-liners are:
cat userinfo.data | awk -F '\t' '{print $8}' | sed /null/d
cat userinfo.data | cut -f 8 | sed /null/d
cat userinfo.data | awk -F '\t' '$8 != "null" {print $8}'
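Note 1 above says the labels still need word segmentation. Below is a minimal sketch of that step, assuming the jieba segmenter is installed and that the labels inside labelList are ':'-separated; both are assumptions for illustration, not part of the original pipeline.

# -*- coding: utf-8 -*-
# Hypothetical follow-up step: segment each user's labels with jieba.
import jieba

def segment_labels(label_list):
    # label_list is column 8 of userinfo.data; ':' as separator is an assumption
    if label_list == 'null':
        return []
    words = []
    for label in label_list.split(':'):
        words.extend(jieba.cut(label))
    return words

print ' '.join(segment_labels('电影爱好者:程序员'))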
2. Mapping and sorting
#!/usr/bin/python
import os
import sys

def main():
    root_dir = sys.argv[1]
    topN = int(sys.argv[2])

    # topic id -> topic content
    topic_total_file = root_dir + '/result_data/topic_id.data.total'
    id_topic = {}
    fin = open(topic_total_file, 'r')
    for line in fin:
        fields = line.strip().split('\t')
        id_topic[fields[1]] = fields[0]
    fin.close()

    # count how often each topic id occurs in the source data
    topicid_count = {}
    sources = ['sina', 'tencent']
    for source in sources:
        input_file = root_dir + '/result_data/' + source + '.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('\t')
            if fields[2] == '-1':
                continue
            topics = fields[2].split(':')
            for topic in topics:
                if topic in topicid_count:
                    topicid_count[topic] += 1
                else:
                    topicid_count[topic] = 1
        fin.close()

    # sort by count, descending, and keep the topN topics
    sort_topic = sorted(topicid_count.items(), key=lambda d: d[1], reverse=True)
    if len(sort_topic) < topN:
        topN = len(sort_topic)
    output_file = root_dir + '/result_data/topic_id.data'
    fout = open(output_file, 'w')
    for i in range(topN):
        print>>fout, "%s\t%s\t%s" % (sort_topic[i][0], id_topic[sort_topic[i][0]],
            topicid_count[sort_topic[i][0]])
    fout.close()

if __name__ == "__main__":
    main()
1) Build two mappings: one from topic id to the topic's text, the other from topic id to the number of times that topic occurs.
2) Sorting by count uses sorted(dict.items(), key=lambda d: d[1], reverse=True), which yields a list of (key, value) tuples (see the sketch below).
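A standalone illustration of this sorting idiom; the counts here are made up:

topicid_count = {'12': 3, '7': 10, '42': 5}
sort_topic = sorted(topicid_count.items(), key=lambda d: d[1], reverse=True)
# sort_topic is a list of (key, value) tuples, largest count first:
# [('7', 10), ('42', 5), ('12', 3)]
top_id, top_count = sort_topic[0]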
3. Document-id allocation and stopword removal
#!/usr/bin/python
import os
import sys

def main():
    if len(sys.argv) != 4:
        print "error parameters!"
        sys.exit(0)
    root_dir = sys.argv[1][0:sys.argv[1].rfind('/')]
    input_dir = sys.argv[1]
    output_root_dir = sys.argv[2]
    topic_multiple = float(sys.argv[3])

    # stopwords
    stopwords_file = root_dir + '/conf/stopwords.list'
    fin = open(stopwords_file, 'r')
    stopwords = set()
    for line in fin:
        word = line.strip()
        stopwords.add(word)
    fin.close()

    # generate ntopics_alpha.data: the number of LDA topics and the alpha prior
    cmd = "wc -l " + root_dir + "/result_data/topic_id.data | awk -F' ' '{print $1}'"
    num_topics = int(int(os.popen(cmd).read().strip()) * topic_multiple)
    alpha = 50 / float(num_topics)
    ntopics_alpha_file = output_root_dir + '/ntopics_alpha.data'
    fout = open(ntopics_alpha_file, 'w')
    print>>fout, "%s %s" % (num_topics, alpha)
    fout.close()

    # allocate docid and remove stopwords
    source_list = ['sina', 'tencent', 'tianya']
    for source in source_list:
        input_file = input_dir + '/' + source + '.data'
        cmd = "wc -l " + input_file + " | awk -F' ' '{print $1}'"
        line_number = os.popen(cmd).read().strip()
        output_file = output_root_dir + '/' + source + '/source.data'
        fin = open(input_file, 'r')
        fout = open(output_file, 'w')
        # GibbsLDA++-style input: the first line is the number of documents
        print>>fout, line_number
        docid = {}
        allocate_id = 0
        for line in fin:
            fields = line.strip().split('\t')
            doc = fields[0]
            docid[doc] = allocate_id
            allocate_id += 1
            line = ""
            for word in fields[1].split(' '):
                if word.strip() and word not in stopwords:
                    line += word + ' '
            if len(line) == 0:
                print>>fout, 'null'
            else:
                print>>fout, line
        fin.close()
        fout.close()

        # persist the doc -> allocated id mapping
        docid_file = output_root_dir + '/' + source + '/docid.map'
        fout = open(docid_file, 'w')
        for doc in docid:
            print>>fout, "%s %s" % (doc, docid[doc])
        fout.close()

if __name__ == "__main__":
    main()
1) How to strip stopwords: a membership test against a set.
2) How to assign each document an id and persist the mapping (docid.map, reloaded later; see the sketch below).
3) In practice, whether to use the raw document id and the word itself as keys, or to process them further first, depends on the situation.
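The docid.map file written above is read back in final_data further down. A minimal sketch of that round trip; the path here is illustrative:

docid_allocateid = {}
fin = open('lda_model/sina/docid.map', 'r')  # illustrative path
for line in fin:
    doc, allocated = line.strip().split(' ')
    docid_allocateid[doc] = allocated
fin.close()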
4. generate_nw_nd
#!/usr/bin/python
import os
import sys

def main():
    root_dir = sys.argv[1]
    cmd = "cat " + root_dir + "/lda_model/ntopics_alpha.data | awk -F' ' '{print $1}' "
    num_topics = int(os.popen(cmd).read().strip())

    source_list = ['sina', 'tencent', 'tianya']
    for source in source_list:
        tassign_file = root_dir + '/lda_model/' + source + '/model-final.tassign'
        nd_file = root_dir + '/lda_model/' + source + '/nd.data'
        # the first line of wordmap.txt is the vocabulary size
        cmd = "head -1 " + root_dir + "/lda_model/" + source + "/wordmap.txt"
        num_tokens = int(os.popen(cmd).read().strip())

        # nw is a flat num_tokens x num_topics matrix of word-topic counts
        nw = [0 for i in range(num_topics * num_tokens)]
        fin = open(tassign_file, 'r')
        fout = open(nd_file, 'w')
        docid = 0
        for line in fin:
            fields = line.strip().split(' ')
            nd = [0 for i in range(num_topics)]  # document-topic counts
            for pair in fields:
                parts = pair.split(':')
                wordid = int(parts[0])
                topicid = int(parts[1])
                nw[wordid * num_topics + topicid] += 1
                nd[topicid] += 1
            print>>fout, "%s %s" % (docid, " ".join([str(i) for i in nd]))
            docid += 1
        fin.close()
        fout.close()

        nw_file = root_dir + '/lda_model/' + source + '/nw.data'
        fout = open(nw_file, 'w')
        for wordid in range(num_tokens):
            line = ''
            for topicid in range(num_topics):
                line += str(nw[wordid * num_topics + topicid]) + ' '
            print>>fout, line
        fout.close()

if __name__ == "__main__":
    main()
1) A flat Python list serves as the nw matrix: entry (wordid, topicid) is stored at index wordid*num_topics + topicid (see the sketch below).
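A small sketch of that indexing, showing the flat list is equivalent to a nested list but needs only a single allocation; the sizes here are toy values:

num_topics, num_tokens = 3, 4
nw = [0 for i in range(num_topics * num_tokens)]   # flat num_tokens x num_topics matrix
nw[2 * num_topics + 1] += 1                        # the count for word 2, topic 1

nw2 = [[0] * num_topics for i in range(num_tokens)]  # nested-list equivalent
nw2[2][1] += 1
assert nw[2 * num_topics + 1] == nw2[2][1]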
5. topic_mapping
#!/usr/bin/python
import os
import sys

def similarity(real_vector, lda_vector):
    # dot product of two {word: weight} dicts over the union of their words
    score = float(0)
    words = set()
    for word in real_vector:
        words.add(word)
    for word in lda_vector:
        words.add(word)
    real_list = []
    lda_list = []
    for word in words:
        if word in real_vector:
            real_list.append(real_vector[word])
        else:
            real_list.append(float(0))
        if word in lda_vector:
            lda_list.append(lda_vector[word])
        else:
            lda_list.append(float(0))
    for i in range(len(real_list)):
        score += real_list[i] * lda_list[i]
    return score

def topic_mapping(realtopic_vector, ldatopic_vector):
    # map each real topic to its most similar lda topic
    real_lda = {}
    for realtopic in realtopic_vector:
        max_topic = '0'
        max_score = float(0)
        for ldatopic in ldatopic_vector:
            score = similarity(realtopic_vector[realtopic], ldatopic_vector[ldatopic])
            if score > max_score:
                max_topic = ldatopic
                max_score = score
        real_lda[realtopic] = max_topic
    return real_lda

def main():
    root_dir = sys.argv[1]
    twords = int(sys.argv[2])
    realtopic_words = int(sys.argv[3])
    source_list = ['sina', 'tencent', 'tianya']

    # generate the vsm of the real topics
    topicid_file = root_dir + "/result_data/topic_id.data"
    realtopic_vsm = {}
    fin = open(topicid_file, 'r')
    for line in fin:
        fields = line.strip().split('\t')
        realtopic_vsm[fields[0]] = {}
    fin.close()

    topic_source_list = ['sina', 'tencent']
    for topic_source in topic_source_list:
        input_file = root_dir + '/result_data/' + topic_source + '.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('\t')
            topicid = fields[2]
            if topicid == '-1':
                continue
            for topic in topicid.split(':'):
                if topic not in realtopic_vsm:
                    continue
                for word in fields[1].split(' '):
                    if word not in realtopic_vsm[topic]:
                        realtopic_vsm[topic][word] = 1
                    else:
                        realtopic_vsm[topic][word] += 1
        fin.close()

    # generate the normalized word vector of each real topic
    realtopic_vector = {}
    for topic in realtopic_vsm:
        realtopic_vector[topic] = {}
        length = realtopic_words
        sorted_tmp = sorted(realtopic_vsm[topic].items(), key=lambda d: d[1], reverse=True)
        if len(sorted_tmp) < length:
            length = len(sorted_tmp)
        sum_count = 0
        for i in range(length):
            sum_count += sorted_tmp[i][1]
        for i in range(length):
            realtopic_vector[topic][sorted_tmp[i][0]] = sorted_tmp[i][1] / float(sum_count)

    # map real topics to lda topics
    for source in source_list:
        input_file = root_dir + "/lda_model/" + source + "/model-final.twords"
        # re-build the lda topic vectors from the twords file
        ldatopic_vector = {}
        fin = open(input_file, 'r')
        cur_topic = "0"
        for line in fin:
            line = line.strip()
            if line.find('Topic') != -1:
                fields = line.split()  # split on any whitespace; columns may be padded
                cur_topic = fields[1][0: fields[1].find('th')]
                ldatopic_vector[cur_topic] = {}
            else:
                fields = line.split()
                word = fields[0]
                weight = float(fields[1])
                if weight > 0.0:
                    ldatopic_vector[cur_topic][word] = weight
        fin.close()

        real_lda = topic_mapping(realtopic_vector, ldatopic_vector)
        output_file = root_dir + "/lda_model/" + source + "/topic_mapping.data"
        fout = open(output_file, 'w')
        for realtopic in real_lda:
            print>>fout, "%s %s" % (realtopic, real_lda[realtopic])
        fout.close()

if __name__ == "__main__":
    main()
1) Map each real topic to an LDA topic (the real topic's word vector is obtained by counting; the LDA topic's word vector comes from training).
2) Compute the similarity of two word vectors stored as dictionaries, i.e. their dot product (see the sketch below).
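The similarity above is a dot product of two sparse vectors stored as dicts. Since missing words contribute 0, an equivalent and shorter form only iterates over the keys present in both dicts:

def dot(a, b):
    # a, b: {word: weight} dicts; only shared words contribute to the product
    if len(a) > len(b):
        a, b = b, a
    return sum(weight * b[word] for word, weight in a.items() if word in b)

# dot({'x': 0.5, 'y': 0.5}, {'y': 0.2, 'z': 0.8}) == 0.1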
6. final_data
#!/usr/bin/python
import sys

def main():
    root_dir = sys.argv[1]
    topn = 2  # the top n lda topics are taken as the real distribution of a document
    source_list = ['sina', 'tencent', 'tianya']
    for source in source_list:
        # allocated doc id -> its topn lda topics
        allocateid_ldatopic = {}  # value is a list
        input_file = root_dir + '/lda_model/' + source + '/nd.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            allocateid = fields[0]
            topic_distribution = {}
            for i in range(1, len(fields)):  # fields[1:] are the per-topic counts
                topic_distribution[i-1] = int(fields[i])
            sorted_tmp = sorted(topic_distribution.items(), key=lambda d: d[1], reverse=True)
            allocateid_ldatopic[allocateid] = []
            for i in range(topn):
                allocateid_ldatopic[allocateid].append(sorted_tmp[i][0])
        fin.close()

        # invert topic_mapping.data: lda topic -> list of real topics
        ldatopic_realtopic = {}  # value is a list
        input_file = root_dir + '/lda_model/' + source + '/topic_mapping.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            ldatopic = fields[1]
            realtopic = fields[0]
            if ldatopic not in ldatopic_realtopic:
                ldatopic_realtopic[ldatopic] = [realtopic]
            else:
                ldatopic_realtopic[ldatopic].append(realtopic)
        fin.close()

        userid_profile = {}
        input_file = root_dir + '/result_data/userinfo.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('\t')
            userid = fields[0]
            sex = fields[1]
            location = fields[2]
            age = fields[3]
            fanscount = fields[5]
            weibocount = fields[6]
            userid_profile[userid] = [sex, location, age, fanscount, weibocount]
        fin.close()

        docid_allocateid = {}
        input_file = root_dir + '/lda_model/' + source + '/docid.map'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            docid_allocateid[fields[0]] = fields[1]
        fin.close()

        # final.data: merge the four mappings into one record per document
        input_file = root_dir + '/result_data/' + source + '.data'
        output_file = root_dir + '/lda_model/' + source + '/final.data'
        fin = open(input_file, 'r')
        fout = open(output_file, 'w')
        for line in fin:
            fields = line.strip().split('\t')
            docid = fields[0]
            allocateid = docid_allocateid[docid]
            topic_set = set()
            if fields[2] != '-1':
                for topic in fields[2].split(':'):
                    if topic in topic_set:
                        continue
                    topic_set.add(topic)
            for ldatopic in allocateid_ldatopic[allocateid]:
                if str(ldatopic) not in ldatopic_realtopic:
                    continue
                for topic in ldatopic_realtopic[str(ldatopic)]:
                    if topic not in topic_set:
                        topic_set.add(topic)
            if topic_set:
                topics = ':'.join(topic_set)
            else:
                topics = 'null'
            comment = fields[3]
            retweet = fields[4]
            praise = fields[5]
            userid = fields[6]
            if userid in userid_profile:
                user_profile = ' '.join(userid_profile[userid])
            else:
                user_profile = 'null null null null null'
            print>>fout, "%s %s %s %s %s %s %s %s" % (docid, allocateid, topics,
                comment, retweet, praise, userid, user_profile)
        fin.close()
        fout.close()

if __name__ == "__main__":
    main()
1) Assign each document its top-2 LDA topics.
2) Invert a dict's keys and values; keys are unique, but several keys can share one value, so the inverted mapping keeps a list of keys per value (see the sketch after this list).
3) Give each document a set of related topics: its own labeled topics plus the real topics projected from its top-2 LDA topics.
4) Merge allocateid_ldatopic, ldatopic_realtopic, userid_profile and docid_allocateid into a single file, final.data.
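A toy version of the inversion in note 2; the ids are made up:

real_lda = {'r1': 't0', 'r2': 't3', 'r3': 't0'}   # realtopic -> ldatopic
ldatopic_realtopic = {}
for realtopic in real_lda:
    # several real topics can share one lda topic, so collect them in a list
    ldatopic_realtopic.setdefault(real_lda[realtopic], []).append(realtopic)
# e.g. {'t0': ['r1', 'r3'], 't3': ['r2']} (list order may vary)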
7. Visualization
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
from string import Template

def replace_template(template_file, replaceDict, output_file):
    fh = open(template_file, 'r')
    content = fh.read()
    fh.close()
    content_template = Template(content)
    content_final = content_template.safe_substitute(replaceDict)
    fout = open(output_file, 'w')
    fout.write(content_final)
    fout.close()

def bar_categories(categories_list):
    # build a javascript array literal: ['a','b',...]
    categories = "["
    for i in range(len(categories_list)):
        if i == len(categories_list) - 1:
            categories += "'" + categories_list[i] + "']"
        else:
            categories += "'" + categories_list[i] + "',"
    return categories

def bar_series(data_list):
    series = "[{ name: 'count', data: ["
    for i in range(len(data_list)):
        if i == len(data_list) - 1:
            series += str(data_list[i]) + "]}]"
        else:
            series += str(data_list[i]) + ","
    return series

def pie_data(data_map):
    data = "["
    index = 0
    for item in data_map:
        if index == len(data_map) - 1:
            data += "['" + str(item) + "'," + str(data_map[item]) + "]"
        else:
            data += "['" + str(item) + "'," + str(data_map[item]) + "],"
        index += 1
    data += "]"
    return data

def main():
    root_dir = sys.argv[1]

    # topicid -> topic content
    topicid_content = {}
    input_file = root_dir + '/result_data/topic_id.data'
    fin = open(input_file, 'r')
    for line in fin:
        fields = line.strip().split('\t')
        topicid_content[fields[0]] = fields[1]
    fin.close()

    # 1. topic distribution
    source_list = ['sina', 'tencent', 'tianya']
    topicid_count = {}
    for source in source_list:
        input_file = root_dir + '/lda_model/' + source + '/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            if fields[2] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic in topicid_count:
                    topicid_count[topic] += 1
                else:
                    topicid_count[topic] = 1
        fin.close()

    # all topics sorted by total count
    sorted_result = sorted(topicid_count.items(), key=lambda d: d[1], reverse=True)
    topN = 20
    replaceDict = {}
    replaceDict['title'] = "'话题分布'"
    replaceDict['subtitle'] = "''"
    categories_list = []
    for i in range(topN):
        categories_list.append(topicid_content[sorted_result[i][0]])
    replaceDict['categories'] = bar_categories(categories_list)
    replaceDict['x_name'] = "'相关微博或帖子条数'"
    data_list = []
    for i in range(topN):
        data_list.append(sorted_result[i][1])
    replaceDict['series'] = bar_series(data_list)
    template_file = root_dir + '/template/horizontal_bar.tpl'
    output_file = root_dir + '/final_html/1.htm'
    replace_template(template_file, replaceDict, output_file)

    # 2. trend of the topic distribution over time (left unimplemented)

    # 3. male/female ratio of each topic's users
    topN = 10
    topicid_sex = {}
    for i in range(topN):
        topicid_sex[sorted_result[i][0]] = [0, 0]
    source_list = ['sina']  # we only have user profiles for sina currently
    for source in source_list:
        input_file = root_dir + '/lda_model/' + source + '/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            if fields[2] == 'null' or fields[7] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_sex:
                    continue
                if fields[7] == "男":
                    topicid_sex[topic][0] += 1
                if fields[7] == "女":
                    topicid_sex[topic][1] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir + '/template/pie.tpl'
        output_file = root_dir + '/final_html/3-' + str(i) + '.htm'
        replaceDict = {}
        replaceDict['title'] = "'#" + topicid_content[sorted_result[i][0]] + "# 关注用户男女比例'"
        sum_count = topicid_sex[sorted_result[i][0]][0] + topicid_sex[sorted_result[i][0]][1]
        sex_map = {}
        sex_map['男'] = topicid_sex[sorted_result[i][0]][0] / float(sum_count)
        sex_map['女'] = topicid_sex[sorted_result[i][0]][1] / float(sum_count)
        replaceDict['data'] = pie_data(sex_map)
        replace_template(template_file, replaceDict, output_file)

    # 4. regional distribution of each topic's users
    topN = 10
    province_conf = root_dir + '/conf/province.list'
    province_list = []
    province_map = {}
    fin = open(province_conf, 'r')
    index = 0
    for line in fin:
        province = line.strip()
        province_list.append(province)
        province_map[province] = index
        index += 1
    fin.close()
    source_list = ['sina']
    topicid_province = {}
    for i in range(topN):
        topicid_province[sorted_result[i][0]] = []
        for j in range(len(province_list)):
            topicid_province[sorted_result[i][0]].append(0)
    for source in source_list:
        input_file = root_dir + '/lda_model/' + source + '/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            if fields[2] == 'null' or fields[8] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_province:
                    continue
                province_index = int(province_map[fields[8]])
                topicid_province[topic][province_index] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir + '/template/horizontal_bar.tpl'
        output_file = root_dir + '/final_html/4-' + str(i) + '.htm'
        replaceDict = {}
        replaceDict['title'] = "'#" + topicid_content[sorted_result[i][0]] + "# 关注用户地域分布'"
        replaceDict['subtitle'] = "''"
        replaceDict['x_name'] = "'相关微博或帖子条数'"
        replaceDict['categories'] = bar_categories(province_list)
        replaceDict['series'] = bar_series(topicid_province[sorted_result[i][0]])
        replace_template(template_file, replaceDict, output_file)

    # 5. age distribution of each topic's users
    topN = 10
    age_list = ['10岁以下', '10-19岁', '20-29岁', '30-39岁', '40-49岁', '50-59岁', '60岁以上']
    source_list = ['sina']
    topicid_age = {}
    for i in range(topN):
        topicid_age[sorted_result[i][0]] = []
        for j in range(len(age_list)):
            topicid_age[sorted_result[i][0]].append(0)
    for source in source_list:
        input_file = root_dir + '/lda_model/' + source + '/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            if fields[2] == 'null' or fields[9] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_age:
                    continue
                age = 2013 - int(fields[9])
                if age <= 9:
                    topicid_age[topic][0] += 1
                elif age >= 10 and age <= 19:
                    topicid_age[topic][1] += 1
                elif age >= 20 and age <= 29:
                    topicid_age[topic][2] += 1
                elif age >= 30 and age <= 39:
                    topicid_age[topic][3] += 1
                elif age >= 40 and age <= 49:
                    topicid_age[topic][4] += 1
                elif age >= 50 and age <= 59:
                    topicid_age[topic][5] += 1
                elif age >= 60:
                    topicid_age[topic][6] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir + '/template/vertical_bar.tpl'
        output_file = root_dir + '/final_html/5-' + str(i) + '.htm'
        replaceDict = {}
        replaceDict['title'] = "'#" + topicid_content[sorted_result[i][0]] + "# 关注用户年龄分布'"
        replaceDict['subtitle'] = "''"
        replaceDict['y_name'] = "'人数'"
        replaceDict['categories'] = bar_categories(age_list)
        replaceDict['series'] = bar_series(topicid_age[sorted_result[i][0]])
        replace_template(template_file, replaceDict, output_file)

    # 6. source-media ratio of each topic
    topN = 10
    source_list = ['sina', 'tencent', 'tianya']
    topicid_source = {}
    for i in range(topN):
        topicid_source[sorted_result[i][0]] = []
        for j in range(len(source_list)):
            topicid_source[sorted_result[i][0]].append(0)
    for source in source_list:
        input_file = root_dir + '/lda_model/' + source + '/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            if fields[2] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_source:
                    continue
                if source == "sina":
                    topicid_source[topic][0] += 1
                if source == "tencent":
                    topicid_source[topic][1] += 1
                if source == "tianya":
                    topicid_source[topic][2] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir + '/template/pie.tpl'
        output_file = root_dir + '/final_html/6-' + str(i) + '.htm'
        replaceDict = {}
        replaceDict['title'] = "'#" + topicid_content[sorted_result[i][0]] + "# 话题来源媒体分布'"
        source_map = {}
        source_map['sina'] = topicid_source[sorted_result[i][0]][0]
        source_map['tencent'] = topicid_source[sorted_result[i][0]][1]
        source_map['tianya'] = topicid_source[sorted_result[i][0]][2]
        replaceDict['data'] = pie_data(source_map)
        replace_template(template_file, replaceDict, output_file)

    # 7. core users of each topic
    topN = 10
    coreuser = 5
    source_list = ['sina']
    topicid_user = {}
    for i in range(topN):
        topicid_user[sorted_result[i][0]] = {}
    for source in source_list:
        input_file = root_dir + '/lda_model/' + source + '/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            if fields[2] == 'null' or fields[6] == 'null':
                continue
            userid = fields[6]
            for topic in fields[2].split(':'):
                if topic not in topicid_user:
                    continue
                if userid not in topicid_user[topic]:
                    topicid_user[topic][userid] = 1
                else:
                    topicid_user[topic][userid] += 1
        fin.close()
    output_file = root_dir + '/final_html/topic_coreuser.list'
    fout = open(output_file, 'w')
    for i in range(topN):
        title = "#" + topicid_content[sorted_result[i][0]] + "# 话题核心关注人物"
        print>>fout, title
        sorted_tmp = sorted(topicid_user[sorted_result[i][0]].items(), key=lambda d: d[1], reverse=True)
        count = coreuser  # clamp per topic; don't shrink coreuser itself for later topics
        if len(sorted_tmp) < count:
            count = len(sorted_tmp)
        for j in range(count):
            print>>fout, "    %s %s" % (sorted_tmp[j][0], sorted_tmp[j][1])  # userid and related document count
    fout.close()

    # 8. fans-count distribution of each topic's users
    topN = 10
    fans_list = ['0-100', '101-1000', '1001-10000', '10001-100000', '100001-500000', '500000以上']
    source_list = ['sina']
    topicid_fans = {}
    for i in range(topN):
        topicid_fans[sorted_result[i][0]] = []
        for j in range(len(fans_list)):
            topicid_fans[sorted_result[i][0]].append(0)
    for source in source_list:
        input_file = root_dir + '/lda_model/' + source + '/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            if fields[2] == 'null' or fields[6] == 'null' or fields[10] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_fans:
                    continue
                fans = int(fields[10])
                if fans <= 100:
                    topicid_fans[topic][0] += 1
                elif fans >= 101 and fans <= 1000:
                    topicid_fans[topic][1] += 1
                elif fans >= 1001 and fans <= 10000:
                    topicid_fans[topic][2] += 1
                elif fans >= 10001 and fans <= 100000:
                    topicid_fans[topic][3] += 1
                elif fans >= 100001 and fans <= 500000:
                    topicid_fans[topic][4] += 1
                elif fans >= 500001:
                    topicid_fans[topic][5] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir + '/template/horizontal_bar.tpl'
        output_file = root_dir + '/final_html/8-' + str(i) + '.htm'
        replaceDict = {}
        replaceDict['title'] = "'#" + topicid_content[sorted_result[i][0]] + "# 关注用户粉丝数分布'"
        replaceDict['subtitle'] = "''"
        replaceDict['x_name'] = "'粉丝数'"
        replaceDict['categories'] = bar_categories(fans_list)
        replaceDict['series'] = bar_series(topicid_fans[sorted_result[i][0]])
        replace_template(template_file, replaceDict, output_file)

    # 9. weibo-count distribution of each topic's users
    topN = 10
    weibo_list = ['0-100', '101-1000', '1001-3000', '3001-5000', '5001-10000', '10000以上']
    source_list = ['sina']
    topicid_weibo = {}
    for i in range(topN):
        topicid_weibo[sorted_result[i][0]] = []
        for j in range(len(weibo_list)):
            topicid_weibo[sorted_result[i][0]].append(0)
    for source in source_list:
        input_file = root_dir + '/lda_model/' + source + '/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            if fields[2] == 'null' or fields[6] == 'null' or fields[11] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_weibo:
                    continue
                weibo = int(fields[11])  # column 11 is the weibo count; 10 is the fans count
                if weibo <= 100:
                    topicid_weibo[topic][0] += 1
                elif weibo >= 101 and weibo <= 1000:
                    topicid_weibo[topic][1] += 1
                elif weibo >= 1001 and weibo <= 3000:
                    topicid_weibo[topic][2] += 1
                elif weibo >= 3001 and weibo <= 5000:
                    topicid_weibo[topic][3] += 1
                elif weibo >= 5001 and weibo <= 10000:
                    topicid_weibo[topic][4] += 1
                elif weibo >= 10001:
                    topicid_weibo[topic][5] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir + '/template/horizontal_bar.tpl'
        output_file = root_dir + '/final_html/9-' + str(i) + '.htm'
        replaceDict = {}
        replaceDict['title'] = "'#" + topicid_content[sorted_result[i][0]] + "# 关注用户微博数分布'"
        replaceDict['subtitle'] = "''"
        replaceDict['x_name'] = "'微博数'"
        replaceDict['categories'] = bar_categories(weibo_list)
        replaceDict['series'] = bar_series(topicid_weibo[sorted_result[i][0]])
        replace_template(template_file, replaceDict, output_file)

    # 10. attention, diffusion and activity of each topic
    topN = 10
    source_list = ['sina', 'tencent', 'tianya']
    topicid_attention = {}
    topicid_diffuse = {}
    topicid_active = {}
    for i in range(topN):
        topicid_attention[sorted_result[i][0]] = set()  # distinct users
        topicid_diffuse[sorted_result[i][0]] = {}       # user -> fans count
        topicid_active[sorted_result[i][0]] = 0         # comments + retweets + praises
    for source in source_list:
        input_file = root_dir + '/lda_model/' + source + '/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split(' ')
            if fields[2] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_attention:
                    continue
                if fields[6] != 'null':
                    if fields[6] not in topicid_attention[topic]:
                        topicid_attention[topic].add(fields[6])
                if fields[10] != 'null':
                    if fields[6] not in topicid_diffuse[topic]:
                        topicid_diffuse[topic][fields[6]] = int(fields[10])
                if fields[3] != 'null':
                    topicid_active[topic] += int(fields[3])
                if fields[4] != 'null':
                    topicid_active[topic] += int(fields[4])
                if fields[5] != 'null':
                    topicid_active[topic] += int(fields[5])
        fin.close()
    output_file = root_dir + '/final_html/topic_attention_diffuse_active.list'
    fout = open(output_file, 'w')
    for i in range(topN):
        title = "#" + topicid_content[sorted_result[i][0]] + "# 关注度、传播度、活跃度"
        print>>fout, title
        attention = len(topicid_attention[sorted_result[i][0]])
        diffuse = 0
        for user in topicid_diffuse[sorted_result[i][0]]:
            diffuse += topicid_diffuse[sorted_result[i][0]][user]
        active = topicid_active[sorted_result[i][0]]
        print>>fout, "    %s %s %s" % (attention, diffuse, active)
    fout.close()

if __name__ == "__main__":
    main()
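All of the HTML generation above rests on string.Template.safe_substitute, which fills $-placeholders from a dict and leaves unknown placeholders untouched instead of raising. A minimal standalone example; the template string here is made up, not the content of the real .tpl files:

# -*- coding: utf-8 -*-
from string import Template

tpl = Template("title: $title, x axis: $x_name, untouched: $other")
print tpl.safe_substitute({'title': "'话题分布'", 'x_name': "'相关微博或帖子条数'"})
# -> title: '话题分布', x axis: '相关微博或帖子条数', untouched: $other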