paper about spring

    1. Parse the raw user-info JSON file

    #!/usr/bin/python
    # -*- coding=utf-8 -*-
    
    import os
    import sys
    
    import json
    
    def main():
    
        root_dir = sys.argv[1]
        
        province_file = root_dir +"/conf/province.list"
        fin = open(province_file, 'r')
        provinces = set()
        for line in fin:
            province = line.strip()
            provinces.add(province)
        fin.close()
    
        input_file  = root_dir +"/source_data/userinfo.json"
        output_file = root_dir +"/result_data/userinfo.data"
    
        fin = open(input_file, 'r')
        fout = open(output_file, 'w')
        for line in fin:
            if line.strip() == "[]":
                continue
            json_file = json.loads(line.strip())
            userid   = json_file['userId']
            sex      = json_file['sex']
            location = json_file['location']
            birthday = json_file['birthday']
            attentioncount = json_file['attentionCount']
            fanscount      = json_file['fansCount']
            weibocount     = json_file['weiboCount']
            label_list=json_file['labelList']
            user_introduce=json_file['userIntroduce']
            if not sex:
                sex = 'null'
            if location.find(' ') != -1:
                fields = location.split(' ')
                location = fields[0]
            elif location:
                for province in provinces:
                    if location.find(province) != -1:
                        location = province
            if not location :
                location = 'null'
            # the argument of find() was garbled away by the page encoding;
            # presumably it was '年', keeping only the year part of birthday
            index = birthday.find('年')
            if index != -1:
                birthday = birthday[0:index]
            else:
                birthday = 'null'
            if not attentioncount:
                attentioncount = '0'
            if not fanscount:
                fanscount = '0'
            if not weibocount:
                weibocount = '0'
            if not label_list or not label_list.strip():
                label_list='null'
            if not user_introduce or not user_introduce.strip():
                user_introduce='null'
            
            print>>fout, "%s	%s	%s	%s	%s	%s	%s	%s	%s"%(userid, sex, location, birthday, attentioncount, fanscount, weibocount,label_list,user_introduce)
        fin.close()
        fout.close()
    
    if __name__ == "__main__":
    
        main()
    UserInfoParser

    1) The user labels need further word segmentation (see the sketch after the shell commands below).

    2) According to this data, roughly one third of all users carry labels.

    3) Users without labels are marked with null here.

    If you need to delete the records without labels, the relevant shell commands are:

    cat userinfo.data | awk -F '\t' '{print $8}' | sed '/null/d'
    cat userinfo.data | cut -f 8 | sed '/null/d'
    cat userinfo.data | awk -F '\t' '$8 != "null" {print $8}'
    UserShell
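
    A minimal sketch of the segmentation mentioned in note 1). It assumes the jieba tokenizer and a comma-separated labelList column; neither is specified in the original post, so treat both as illustrative choices:

    #!/usr/bin/python
    # -*- coding=utf-8 -*-

    import jieba  # assumed tokenizer; any Chinese segmenter would do

    def segment_labels(userinfo_file):
        for line in open(userinfo_file, 'r'):
            fields = line.rstrip('\n').split('\t')
            label_list = fields[7]                    # 8th column: labelList
            if label_list == 'null':
                continue
            tokens = []
            for label in label_list.split(','):       # assumed separator
                tokens.extend(jieba.lcut(label))      # jieba.lcut returns a list of tokens
            print('\t'.join([fields[0]] + tokens))    # userid followed by segmented label words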

    2. Mapping and sorting operations

    #!/usr/bin/python
    
    import os
    import sys
    
    def main():
    
        root_dir = sys.argv[1]
        topN     = int(sys.argv[2])
    
        topic_total_file = root_dir +'/result_data/topic_id.data.total'
        id_topic = {}
        fin = open(topic_total_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            id_topic[fields[1]] = fields[0]
        fin.close()
    
        topicid_count = {}
        sources = ['sina', 'tencent']
        for source in sources:
            input_file = root_dir +'/result_data/'+ source +'.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                if fields[2] == '-1':
                    continue
                topics = fields[2].split(':')
                for topic in topics:
                    if topic in topicid_count:
                        topicid_count[topic] += 1
                    else:
                        topicid_count[topic] = 1
            fin.close()
        sort_topic = sorted(topicid_count.items(), key = lambda d:d[1], reverse=True)
        if len(sort_topic) < topN:
            topN = len(sort_topic)
        output_file = root_dir +'/result_data/topic_id.data'
        fout = open(output_file, 'w')
        for i in range(topN):
            print>>fout, "%s	%s	%s"%(sort_topic[i][0], id_topic[sort_topic[i][0]], topicid_count[sort_topic[i][0]])
        fout.close()
    
    if __name__ == "__main__":
        
        main()
    TopN_topic

    1) Build two mappings (dicts, not lists): one stores the id-to-topic correspondence, the other stores how many times each topic id occurs.

    2) Sorting by count is done with sorted(dict.items(), key=lambda d: d[1], reverse=True), which yields a list of (key, value) tuples. A toy example follows.
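
    A toy run of the sorting idiom from note 2); the values are made up for illustration:

    topicid_count = {'12': 7, '5': 31, '9': 2}
    sort_topic = sorted(topicid_count.items(), key=lambda d: d[1], reverse=True)
    # sort_topic is now a list of (id, count) tuples: [('5', 31), ('12', 7), ('9', 2)]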

    3. Document-id allocation and stopword handling

    #!/usr/bin/python
    
    import os
    import sys
    
    def main():
    
        if len(sys.argv) != 4:
            print "error parameters!"
            sys.exit(0)
    
        root_dir = sys.argv[1][0:sys.argv[1].rfind('/')]
        input_dir = sys.argv[1]
        output_root_dir = sys.argv[2]
        topic_multiple = float(sys.argv[3])
        
        # stopwords
        stopwords_file = root_dir +'/conf/stopwords.list'
        fin = open(stopwords_file, 'r')
        stopwords = set()
        for line in fin:
            word = line.strip()
            stopwords.add(word)
        fin.close()
    
        # generate ntopics_alpha.data
        cmd = "wc -l "+ root_dir +"/result_data/topic_id.data | awk -F' ' '{print $1}'"
        num_topics = int(int(os.popen(cmd).read().strip()) * topic_multiple)
        alpha = 50 / float(num_topics)
        ntopics_alpha_file = output_root_dir +'/ntopics_alpha.data'
        fout = open(ntopics_alpha_file, 'w')
        print>>fout, "%s	%s"%(num_topics, alpha)
        fout.close()
    
        # allocate docid and remove stopwords
        source_list = ['sina', 'tencent', 'tianya']
        for source in source_list:
            input_file = input_dir +'/'+ source +'.data'
            cmd = "wc -l "+ input_file +" | awk -F' ' '{print $1}'"
            line_number = os.popen(cmd).read().strip()
            output_file = output_root_dir +'/'+ source +'/source.data'
            fin = open(input_file, 'r')
            fout = open(output_file, 'w')
            print>>fout, line_number
            docid = {}
            allocate_id = 0
            for line in fin:
                fields = line.strip().split('	')
                doc    = fields[0]
                docid[doc] = allocate_id
                allocate_id += 1
                line = ""
                for word in fields[1].split(' '):
                    if word.strip() and word not in stopwords:
                        line += word +'	'
                if len(line) == 0:
                    print>>fout, 'null'
                else:
                    print>>fout, line
            fin.close()
            fout.close()
            docid_file = output_root_dir +'/'+ source +'/docid.map'
            fout = open(docid_file, 'w')
            for doc in docid:
                print>>fout, "%s	%s"%(doc, docid[doc])
            fout.close()
    
    if __name__ == "__main__":
    
        main()
    allocateDocId

    1) How to remove stopwords.

    2) How to assign an id to each document and persist the mapping.

    3) In practice, whether to key on the raw document id and the word itself, or to process them further, can depend on the situation. (A minimal sketch of 1) and 2) follows.)
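
    A minimal sketch of notes 1) and 2), with a made-up stopword set and two toy documents (the real script reads these from conf/stopwords.list and the source files):

    # -*- coding=utf-8 -*-
    # filter stopwords and hand out consecutive integer doc ids
    stopwords = set(['的', '了', '是'])        # toy stopword set

    docid = {}
    for doc, text in [('docA', '今天 的 天气'), ('docB', '是 好 天气')]:
        docid[doc] = len(docid)                # next free integer id
        kept = [w for w in text.split(' ') if w.strip() and w not in stopwords]
        print('%s\t%s' % (docid[doc], '\t'.join(kept) if kept else 'null'))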

    4. generate_nw_nd

    #!/usr/bin/python
    
    import os
    import sys
    
    def main():
    
        root_dir = sys.argv[1]
    
        cmd = "cat "+ root_dir +"/lda_model/ntopics_alpha.data | awk -F' ' '{print $1}' "
        num_topics = int(os.popen(cmd).read().strip())
    
        source_list = ['sina', 'tencent', 'tianya']
        for source in source_list:
            tassign_file = root_dir +'/lda_model/'+ source +'/model-final.tassign'
            nd_file = root_dir +'/lda_model/'+ source +'/nd.data'
            cmd = "head -1 "+ root_dir +"/lda_model/"+ source +"/wordmap.txt"
            num_tokens = int(os.popen(cmd).read().strip())
            nw = [0 for i in range(num_topics * num_tokens)]
            fin = open(tassign_file, 'r')
            fout = open(nd_file, 'w')
            docid = 0
            for line in fin:
                fields = line.strip().split(' ')
                nd = [0 for i in range(num_topics)]
                for pair in fields:
                    parts   = pair.split(':')
                    wordid  = int(parts[0])
                    topicid = int(parts[1])
                    nw[wordid*num_topics + topicid] += 1
                    nd[topicid] += 1
                print>>fout, "%s	%s"%(docid, "	".join([str(i) for i in nd]))
                docid += 1
            fin.close()
            fout.close()
            nw_file = root_dir +'/lda_model/'+ source +'/nw.data'
            fout = open(nw_file, 'w')
            for wordid in range(num_tokens):
                line = ''
                for topicid in range(num_topics):
                    line += str(nw[wordid*num_topics + topicid]) +'	'
                print>>fout, line
            fout.close()
    
    if __name__ == "__main__":
        main()
    generate_nw_nd

    1) A flat list can serve as a 2-D matrix via row-major indexing, as sketched below.
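
    The nw counts in the script above use exactly this trick; a tiny self-contained illustration:

    # a conceptual nw[wordid][topicid] matrix stored row-major in one flat list
    num_topics = 4
    num_tokens = 3
    nw = [0] * (num_topics * num_tokens)

    def incr(wordid, topicid):
        nw[wordid * num_topics + topicid] += 1   # row = wordid, column = topicid

    incr(2, 3)
    assert nw[2 * num_topics + 3] == 1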

    5. topic_mapping

    #!/usr/bin/python
    
    import os
    import sys
    
    def similarity(real_vector, lda_vector):
    
        score = float(0)
    
        words = set(real_vector) | set(lda_vector)  # union of the two vocabularies
        
        real_list = []
        lda_list = []
        for word in words:
            if word in real_vector:
                real_list.append(real_vector[word])
            else:
                real_list.append(float(0))
            if word in lda_vector:
                lda_list.append(lda_vector[word])
            else:
                lda_list.append(float(0))
        for i in range(len(real_list)):
            score += real_list[i] * lda_list[i]
    
        return score
    
    def topic_mapping(realtopic_vector, ldatopic_vector):
    
        real_lda = {}
        
        for realtopic in realtopic_vector:
            max_topic = '0'
            max_score = float(0)
            for ldatopic in ldatopic_vector:
                score = similarity(realtopic_vector[realtopic], ldatopic_vector[ldatopic])
                if score > max_score:
                    max_topic = ldatopic
                    max_score = score
            real_lda[realtopic] = max_topic
    
        return real_lda
    
    def main():
    
        root_dir = sys.argv[1]
        twords   = int(sys.argv[2])
        realtopic_words = int(sys.argv[3])
    
        source_list = ['sina', 'tencent', 'tianya']
        
        # generate vsm of real topic
        topicid_file = root_dir +"/result_data/topic_id.data"
        realtopic_vsm = {}
        fin = open(topicid_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            realtopic_vsm[fields[0]] = {}
        fin.close()
        topic_source_list = ['sina', 'tencent']
        for topic_source in topic_source_list:
            input_file = root_dir +'/result_data/'+ topic_source +'.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                topicid = fields[2]
                if topicid == '-1':
                    continue
                for topic in topicid.split(':'):
                    if topic not in realtopic_vsm:
                        continue
                    for word in fields[1].split(' '):
                        if word not in realtopic_vsm[topic]:
                            realtopic_vsm[topic][word] = 1
                        else:
                            realtopic_vsm[topic][word] += 1
            fin.close()
        # generate vector of real topic
        realtopic_vector = {}
        for topic in realtopic_vsm:
            realtopic_vector[topic] = {}
            length = realtopic_words
            sorted_tmp = sorted(realtopic_vsm[topic].items(), key = lambda d:d[1], reverse=True)
            if len(sorted_tmp) < length:
                length = len(sorted_tmp)
            sum_count = 0
            for i in range(length):
                sum_count += sorted_tmp[i][1]
            for i in range(length):
                realtopic_vector[topic][sorted_tmp[i][0]] = sorted_tmp[i][1] / float(sum_count)
    
        # mapping real topic with lda topic
        for source in source_list:
            input_file = root_dir +"/lda_model/"+ source +"/model-final.twords"
            # re-build topic vectoc
            ldatopic_vector = {}
            fin = open(input_file, 'r')
            cur_topic = "0"
            for line in fin:
                line = line.strip()
                if line.find('Topic') != -1:
                    fields = line.split(' ')
                    cur_topic = fields[1][0: fields[1].find('th')]
                    ldatopic_vector[cur_topic] = {}
                else:
                    fields = line.split('	')
                    word = fields[0]
                    weight = float(fields[1])
                    if weight > 0.0:
                        ldatopic_vector[cur_topic][word] = weight
            fin.close()
            real_lda = topic_mapping(realtopic_vector, ldatopic_vector) 
            output_file = root_dir +"/lda_model/"+ source +"/topic_mapping.data"
            fout = open(output_file, 'w')
            for realtopic in real_lda:
                print>>fout, "%s	%s"%(realtopic, real_lda[realtopic])
            fout.close()
                    
    
    if __name__ == "__main__":
    
        main()
    topic_mapping

    1) Map each real_topic to an lda_topic (the real_topic's word weights come from counting; the lda_topic's come from training).

    2) Compute the similarity of two dictionaries treated as vectors; an equivalent, more direct form is sketched below.
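
    The union-and-pad construction in similarity() is just a dot product: a word present in only one vector contributes 0, so it can be skipped. An equivalent, shorter form (same result, shown only for clarity):

    def dot(real_vector, lda_vector):
        # sum the weight products over the words the two dicts share
        score = 0.0
        for word, weight in real_vector.items():
            score += weight * lda_vector.get(word, 0.0)
        return score

    print(dot({'a': 0.6, 'b': 0.4}, {'b': 0.5, 'c': 0.5}))   # prints 0.2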

    6. final_data

    #!/usr/bin/python
    
    import sys
    
    def main():
        root_dir = sys.argv[1]
    
        topn = 2 # the top-n lda topics are taken as the document's real distribution
        source_list = ['sina', 'tencent', 'tianya']
        for source in source_list:
            allocateid_ldatopic = {} # value is a list
            input_file = root_dir +'/lda_model/'+ source +'/nd.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                allocateid =  fields[0]
                topic_distribution = {}
                # fields holds docid plus one count per topic;
                # the original range(1, len(fields)-1) dropped the last topic's count
                for i in range(1, len(fields)):
                    topic_distribution[i-1] = int(fields[i])
                sorted_tmp = sorted(topic_distribution.items(), key = lambda d:d[1], reverse=True)
                allocateid_ldatopic[allocateid] = []
                for i in range(topn):
                    allocateid_ldatopic[allocateid].append(sorted_tmp[i][0])
            fin.close()
            ldatopic_realtopic = {} # value is a list
            input_file = root_dir +'/lda_model/'+ source +'/topic_mapping.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                ldatopic = fields[1]
                realtopic = fields[0]
                if ldatopic not in ldatopic_realtopic:
                    ldatopic_realtopic[ldatopic] = [realtopic]
                else:
                    ldatopic_realtopic[ldatopic].append(realtopic)
            fin.close()
            userid_profile = {}
            input_file = root_dir +'/result_data/userinfo.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                userid = fields[0]
                sex = fields[1]
                location = fields[2]
                age = fields[3]
                fanscount = fields[5]
                weibocount = fields[6]
                userid_profile[userid] = [sex, location, age, fanscount, weibocount]
            fin.close()
            docid_allocateid = {}
            input_file = root_dir +'/lda_model/'+ source +'/docid.map'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                docid_allocateid[fields[0]] = fields[1]
            fin.close()
            # final.data
            input_file = root_dir +'/result_data/'+ source +'.data'
            output_file = root_dir +'/lda_model/'+ source +'/final.data'
            fin = open(input_file, 'r')
            fout = open(output_file, 'w')
            for line in fin:
                fields = line.strip().split('	')
                docid = fields[0] 
                allocateid = docid_allocateid[docid]
                topic_set = set()
                if fields[2] != '-1':
                    for topic in fields[2].split(':'):
                        if topic in topic_set:
                            continue
                        topic_set.add(topic)
                for ldatopic in allocateid_ldatopic[allocateid]:
                    if str(ldatopic) not in ldatopic_realtopic:
                        continue
                    for topic in ldatopic_realtopic[str(ldatopic)]:
                        if topic not in topic_set:
                            topic_set.add(topic)
                if topic_set:
                    topics = ':'.join(topic_set)
                else:
                    topics = 'null'
                comment = fields[3]
                retweet = fields[4]
                praise = fields[5]
                userid = fields[6]
                if userid in userid_profile:
                    user_profile = '	'.join(userid_profile[userid])
                else:
                    user_profile = 'null	null	null	null	null'
                print>>fout, "%s	%s	%s	%s	%s	%s	%s	%s"%(docid, allocateid, topics, comment, retweet, praise, userid, user_profile)
            fin.close()
            fout.close()
    
    if __name__ == "__main__":
    
        main()
    final_data

    1) Assign each doc its top-2 lda_topics.

    2) Invert a dict's keys and values: the keys are unique, but the same value can map back from different keys, so each inverted value must hold a list (see the sketch below).

    3) Give each doc a set of related topics: its own topics plus the real topics mapped from its top-2 lda topics.

    4) Merge allocateid_ldatopic, ldatopic_realtopic, userid_profile, and docid_allocateid into a single file.
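
    A toy illustration of the inversion in note 2), with made-up topic ids:

    real_lda = {'r1': 't0', 'r2': 't0', 'r3': 't7'}
    ldatopic_realtopic = {}
    for realtopic, ldatopic in real_lda.items():
        ldatopic_realtopic.setdefault(ldatopic, []).append(realtopic)
    # ldatopic_realtopic == {'t0': ['r1', 'r2'], 't7': ['r3']}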

    7. Visualization

    #!/usr/bin/python
    # -*- coding=utf-8 -*-
    
    import sys
    from string import Template
    
    def replace_template(template_file, replaceDict, output_file):
        
        fh = open(template_file, 'r')
        content = fh.read()
        fh.close()
        content_template = Template(content)
        content_final = content_template.safe_substitute(replaceDict)
        
        fout = open(output_file, 'w')
        fout.write(content_final)
        fout.close()
    
    def bar_categories(categories_list):
        categories = "["
        for i in range(len(categories_list)):
            if i == len(categories_list)-1:
                categories += "'"+ categories_list[i] +"']"
            else:
                categories += "'"+ categories_list[i] +"',"
        return categories
    
    def bar_series(data_list):
        series = "[{ name: 'count', data: ["
        for i in range(len(data_list)):
            if i == len(data_list)-1:
                series += str(data_list[i]) +"]}]"
            else:
                series += str(data_list[i]) +","
        return series
    
    def pie_data(data_map):
        data = "["
        index = 0
        for item in data_map:
            if index == len(data_map)-1:
                data += "['"+ str(item) +"',"+ str(data_map[item]) +"]"
            else:
                data += "['"+ str(item) +"',"+ str(data_map[item]) +"],"
            index += 1  # was missing: every entry took the else branch, leaving a stray trailing comma
        data += "]"
        return data
    
    def main():
    
        root_dir = sys.argv[1]
    
        # topicid and topic's content
        topicid_content = {}
        input_file = root_dir +'/result_data/topic_id.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            topicid_content[fields[0]] = fields[1]
        fin.close()
    
        #1. Topic distribution
        source_list = ['sina', 'tencent', 'tianya']
        topicid_count = {} 
        for source in source_list:
            input_file = root_dir +'/lda_model/'+ source +'/final.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                if fields[2] == 'null':
                    continue
                for topic in fields[2].split(':'):
                    if topic in topicid_count:
                        topicid_count[topic] += 1
                    else:
                        topicid_count[topic] = 1
        # all topics sorted by total count
        sorted_result = sorted(topicid_count.items(), key = lambda d:d[1], reverse=True)
        
        topN = 20
        replaceDict = {}
        replaceDict['title'] = "'话题分布'"
        replaceDict['subtitle'] = "''"
        categories_list = []
        for i in range(topN):
            categories_list.append(topicid_content[ sorted_result[i][0] ])
        replaceDict['categories'] = bar_categories(categories_list)
        
        replaceDict['x_name'] = "'相关微博或帖子条数'"
        
        data_list = []
        for i in range(topN):
            data_list.append(sorted_result[i][1])
        replaceDict['series'] = bar_series(data_list)
        
        template_file = root_dir +'/template/horizontal_bar.tpl'
        output_file = root_dir +'/final_html/1.htm'
        replace_template(template_file, replaceDict, output_file)
    
                
        
        #2. Topic distribution trend over time

        #3. Male/female ratio of users following each topic
        topN = 10
        topicid_sex = {}
        for i in range(topN):
            topicid_sex[sorted_result[i][0]] = [0, 0]
        source_list = ['sina'] # we only have user profiles for sina currently
        for source in source_list:
            input_file = root_dir +'/lda_model/'+ source +'/final.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                if fields[2] == 'null' or fields[7] == 'null':
                    continue
                for topic in fields[2].split(':'):
                    if topic not in topicid_sex:
                        continue
                    # these string literals were garbled away in the page encoding;
                    # presumably '男' (male) and '女' (female) from the sex field
                    if fields[7] == "男":
                        topicid_sex[topic][0] += 1
                    if fields[7] == "女":
                        topicid_sex[topic][1] += 1
            fin.close()
        for i in range(topN):
            template_file = root_dir +'/template/pie.tpl'
            output_file = root_dir +'/final_html/3-'+ str(i) +'.htm'
            replaceDict = {}
            replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"# 关注用户男女比例'"
            sum_count = topicid_sex[sorted_result[i][0]][0] + topicid_sex[sorted_result[i][0]][1]
            sex_map = {}
            # the dict keys were garbled in the encoding; presumably '男' and '女' again
            sex_map['男'] = topicid_sex[sorted_result[i][0]][0] / float(sum_count)
            sex_map['女'] = topicid_sex[sorted_result[i][0]][1] / float(sum_count)
            replaceDict['data'] = pie_data(sex_map)
            replace_template(template_file, replaceDict, output_file)
    
    
        #4. Geographic (province) distribution of users following each topic
        topN = 10
        province_conf = root_dir +'/conf/province.list'
        province_list = []
        province_map = {}
        fin = open(province_conf, 'r')
        index = 0
        for line in fin:
            province = line.strip()
            province_list.append(province)
            province_map[province] = index
            index += 1
        fin.close()
        source_list = ['sina']
        topicid_province = {}
        for i in range(topN):
            topicid_province[sorted_result[i][0]] = []
            for j in range(len(province_list)):
                topicid_province[sorted_result[i][0]].append(0)
        for source in source_list:
            input_file = root_dir +'/lda_model/'+ source +'/final.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                if fields[2] == 'null' or fields[8] == 'null':
                    continue
                for topic in fields[2].split(':'):
                    if topic not in topicid_province:
                        continue
                    province_index = int(province_map[fields[8]])
                    topicid_province[topic][province_index] += 1
            fin.close()
        for i in range(topN):
            template_file = root_dir +'/template/horizontal_bar.tpl'
            output_file = root_dir +'/final_html/4-'+ str(i) +'.htm'
            replaceDict = {}
            replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"# 关注用户地域分布'"
            replaceDict['subtitle'] = "''"
            replaceDict['x_name'] = "'相关微博或帖子条数'"
            replaceDict['categories'] = bar_categories(province_list)
            replaceDict['series'] = bar_series(topicid_province[sorted_result[i][0]])
            replace_template(template_file, replaceDict, output_file)
    
    
        #5. Age distribution of users following each topic
        topN = 10
        age_list = ['10岁以下', '10-19岁', '20-29岁', '30-39岁', '40-49岁', '50-59岁', '60岁以上']
        source_list = ['sina']
        topicid_age = {}
        for i in range(topN):
            topicid_age[sorted_result[i][0]] = []
            for j in range(len(age_list)):
                topicid_age[sorted_result[i][0]].append(0)
        for source in source_list:
            input_file = root_dir +'/lda_model/'+ source +'/final.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                if fields[2] == 'null' or fields[9] == 'null':
                    continue
                for topic in fields[2].split(':'):
                    if topic not in topicid_age:
                        continue
                    age  = 2013 -int(fields[9])
                    if age <= 9:
                        topicid_age[topic][0] += 1
                    elif age >= 10 and age <= 19:
                        topicid_age[topic][1] += 1
                    elif age >= 20 and age <= 29:
                        topicid_age[topic][2] += 1
                    elif age >= 30 and age <= 39:
                        topicid_age[topic][3] += 1
                    elif age >= 40 and age <= 49:
                        topicid_age[topic][4] += 1
                    elif age >= 50 and age <= 59:
                        topicid_age[topic][5] += 1
                    elif age >= 60:
                        topicid_age[topic][6] += 1                    
            fin.close()
        for i in range(topN):
            template_file = root_dir +'/template/vertical_bar.tpl'
            output_file = root_dir +'/final_html/5-'+ str(i) +'.htm'
            replaceDict = {}
            replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"# 关注用户年龄分布'"
            replaceDict['subtitle'] = "''"
            replaceDict['y_name'] = "'人数'"
            replaceDict['categories'] = bar_categories(age_list)
            replaceDict['series'] = bar_series(topicid_age[sorted_result[i][0]])
            replace_template(template_file, replaceDict, output_file)
        
    
        #6. Share of each source medium per topic
        topN = 10
        source_list = ['sina', 'tencent', 'tianya']
        topicid_source = {}
        for i in range(topN):
            topicid_source[sorted_result[i][0]] = []
            for j in range(len(source_list)):
                topicid_source[sorted_result[i][0]].append(0)
        for source in source_list:
            input_file = root_dir +'/lda_model/'+ source +'/final.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                if fields[2] == 'null':
                    continue
                for topic in fields[2].split(':'):
                    if topic not in topicid_source:
                        continue
                    if source == "sina":
                        topicid_source[topic][0] += 1
                    if source == "tencent":
                        topicid_source[topic][1] += 1
                    if source == "tianya":
                        topicid_source[topic][2] += 1
            fin.close()
        for i in range(topN):
            template_file = root_dir +'/template/pie.tpl'
            output_file = root_dir +'/final_html/6-'+ str(i) +'.htm'
            replaceDict = {}
            replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"#  话题来源媒体分布'"
            source_map = {}
            source_map['sina'] = topicid_source[sorted_result[i][0]][0]
            source_map['tencent'] = topicid_source[sorted_result[i][0]][1]
            source_map['tianya'] = topicid_source[sorted_result[i][0]][2]
            replaceDict['data'] = pie_data(source_map)
            replace_template(template_file, replaceDict, output_file)
    
        #7. Core users of each topic
        topN = 10
        coreuser = 5
        source_list = ['sina']
        topicid_user = {}
        for i in range(topN):
            topicid_user[sorted_result[i][0]] = {}
        for source in source_list:
            input_file = root_dir +'/lda_model/'+ source +'/final.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                if fields[2] == 'null' or fields[6] == 'null':
                    continue
                userid = fields[6]
                for topic in fields[2].split(':'):
                    if topic not in topicid_user:
                        continue
                    if userid not in topicid_user[topic]:
                        topicid_user[topic][userid] = 1
                    else:
                        topicid_user[topic][userid] += 1
            fin.close()
        output_file = root_dir +'/final_html/topic_coreuser.list'
        fout = open(output_file, 'w')
        for i in range(topN):
            title = "#"+ topicid_content[sorted_result[i][0]] +"#  话题核心关注人物"
            print>>fout, title
            sorted_tmp = sorted(topicid_user[sorted_result[i][0]].items(), key = lambda d:d[1], reverse =True)
            n = min(coreuser, len(sorted_tmp))  # don't shrink coreuser itself, or later topics inherit the smaller limit
            for j in range(n):
                print>>fout, "	%s	%s"%(sorted_tmp[j][0], sorted_tmp[j][1]) # userid and related documents count
        fout.close()
    
        #8. Follower-count distribution of users following each topic
        topN = 10
        fans_list = ['0-100', '101-1000', '1001-10000', '10001-100000', '100001-500000', '500000以上']
        source_list = ['sina']
        topicid_fans = {}
        for i in range(topN):
            topicid_fans[sorted_result[i][0]] = []
            for j in range(len(fans_list)):
                topicid_fans[sorted_result[i][0]].append(0)
        for source in source_list:
            input_file = root_dir +'/lda_model/'+ source +'/final.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                if fields[2] == 'null' or fields[6] == 'null' or fields[10] == 'null':
                    continue
                for topic in fields[2].split(':'):
                    if topic not in topicid_fans:
                        continue
                    fans = int(fields[10])
                    if fans <= 100:
                        topicid_fans[topic][0] += 1
                    elif fans >= 101 and fans <= 1000:
                        topicid_fans[topic][1] += 1
                    elif fans >= 1001 and fans <= 10000:
                        topicid_fans[topic][2] += 1
                    elif fans >= 10001 and fans <= 100000:
                        topicid_fans[topic][3] += 1
                    elif fans >= 100001 and fans <= 500000:
                        topicid_fans[topic][4] += 1
                    elif fans >= 500001:
                        topicid_fans[topic][5] += 1
            fin.close()
        for i in range(topN):
            template_file = root_dir +'/template/horizontal_bar.tpl'
            output_file = root_dir +'/final_html/8-'+ str(i) +'.htm'
            replaceDict = {}
            replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"# 关注用户粉丝数分布'"
            replaceDict['subtitle'] = "''"
            replaceDict['x_name'] = "'粉丝数'"
            replaceDict['categories'] = bar_categories(fans_list)
            replaceDict['series'] = bar_series(topicid_fans[sorted_result[i][0]])
            replace_template(template_file, replaceDict, output_file)
    
        #9. Post-count (weibo) distribution of users following each topic
        topN = 10
        weibo_list = ['0-100', '101-1000', '1001-3000', '3001-5000', '5001-10000', '10000以上']
        source_list = ['sina']
        topicid_weibo = {}
        for i in range(topN):
            topicid_weibo[sorted_result[i][0]] = []
            for j in range(len(weibo_list)):
                topicid_weibo[sorted_result[i][0]].append(0)
        for source in source_list:
            input_file = root_dir +'/lda_model/'+ source +'/final.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                if fields[2] == 'null' or fields[6] == 'null' or fields[11] == 'null':
                    continue
                for topic in fields[2].split(':'):
                    if topic not in topicid_weibo:
                        continue
                    weibo = int(fields[11])  # was fields[10], which is the fans count; weibocount is column 11
                    if weibo <= 100:
                        topicid_weibo[topic][0] += 1
                    elif weibo >= 101 and weibo <= 1000:
                        topicid_weibo[topic][1] += 1
                    elif weibo >= 1001 and weibo <= 3000:
                        topicid_weibo[topic][2] += 1
                    elif weibo >= 3001 and weibo <= 5000:
                        topicid_weibo[topic][3] += 1
                    elif weibo >= 5001 and weibo <= 10000:
                        topicid_weibo[topic][4] += 1
                    elif weibo >= 10001:
                        topicid_weibo[topic][5] += 1
            fin.close()
        for i in range(topN):
            template_file = root_dir +'/template/horizontal_bar.tpl'
            output_file = root_dir +'/final_html/9-'+ str(i) +'.htm'
            replaceDict = {}
            replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"# 关注用户微博数分布'"
            replaceDict['subtitle'] = "''"
            replaceDict['x_name'] = "'微博数'"
            replaceDict['categories'] = bar_categories(weibo_list)
            replaceDict['series'] = bar_series(topicid_weibo[sorted_result[i][0]])
            replace_template(template_file, replaceDict, output_file)
    
        #10. Attention, diffusion, and activity per topic
        topN = 10
        source_list = ['sina', 'tencent', 'tianya']
        topicid_attention = {}
        topicid_diffuse   = {}
        topicid_active    = {}
        for i in range(topN):
            topicid_attention[sorted_result[i][0]] = set() #userlist
            topicid_diffuse[sorted_result[i][0]]   = {} #user and fans
            topicid_active[sorted_result[i][0]]    = 0 #comment and retweet and praise
        for source in source_list:
            input_file = root_dir +'/lda_model/'+ source +'/final.data'
            fin = open(input_file, 'r')
            for line in fin:
                fields = line.strip().split('	')
                if fields[2] == 'null':
                    continue
                for topic in fields[2].split(':'):
                    if topic not in topicid_attention:
                        continue
                    if fields[6] != 'null':
                        if fields[6] not in topicid_attention[topic]:
                            topicid_attention[topic].add(fields[6])
                        if fields[10] != 'null':
                            if fields[6] not in topicid_diffuse[topic]:
                                topicid_diffuse[topic][fields[6]] = int(fields[10])
                    if fields[3] != 'null':
                        topicid_active[topic] += int(fields[3])
                    if fields[4] != 'null':
                        topicid_active[topic] += int(fields[4])
                    if fields[5] != 'null':
                        topicid_active[topic] += int(fields[5])
            fin.close()
        output_file = root_dir +'/final_html/topic_attention_diffuse_active.list'
        fout = open(output_file, 'w')
        for i in range(topN):
            title = "#"+ topicid_content[sorted_result[i][0]] +"#  关注度、传播度、活跃度"
            print>>fout, title
            attention = len(topicid_attention[sorted_result[i][0]])
            diffuse = 0 
            for user in topicid_diffuse[sorted_result[i][0]]:
                diffuse += topicid_diffuse[sorted_result[i][0]][user]
            active = topicid_active[sorted_result[i][0]]
            print>>fout, "	%s	%s	%s"%(attention, diffuse, active)
        fout.close()
        
    
    if __name__ == "__main__":
        main()
    visualization
Original post: https://www.cnblogs.com/bobodeboke/p/3500924.html