zoukankan      html  css  js  c++  java
  • 统计中国、美国、世界排名前50的关键词并进行比较

    1. 获取中国的所有关键词

    import pymysql
    import json
    
    # Fetch the keyword string and PMC id of every `alzheimer` row whose
    # `authorinfor` column contains 'china' and whose keyword string is
    # non-empty.
    conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
    )
    try:
        cursor = conn.cursor()

        sql = "SELECT union_kwd_str,pmc_id FROM alzheimer where authorinfor like '%china%' && union_kwd_str != ''"
        # The value returned by execute() is used as the matched-row count.
        a = cursor.execute(sql)
        print(a)  # single-argument call form works on Python 2 and 3
        b = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # Column 0 is the keyword string; column 1 is the PMC id, stored under
    # the row's position in the result set.
    abstract_list = [row[0] for row in b]
    pmc_id_dict = {j: row[1] for j, row in enumerate(b)}
    
    
    
    def output_to_json(data, filename):
        """Serialize ``data`` to JSON, write it to ``filename``, return the text.

        The data is serialized once, before the file is opened, so a value
        that cannot be encoded does not leave a truncated file behind.

        Args:
            data: Any JSON-serializable object.
            filename: Path of the file to create or overwrite.

        Returns:
            The JSON string that was written to the file.
        """
        text = json.dumps(data)
        # The with-block closes the handle; no explicit close() is needed.
        with open(filename, 'w') as fh:
            fh.write(text)
        return text
    
    # Bundle the country label, the row count and the keyword strings,
    # then persist them for the ranking step.
    output_data = {}
    output_data['country'] = "china"
    output_data['count'] = a
    output_data['keyword'] = abstract_list
    output_to_json(output_data, '1203_china_kwd.json')

    选出排名前50的关键词

    import re  
    import collections  
    import json
    
    def input_from_json(filename):
        """Load and return the JSON document stored in ``filename``.

        Args:
            filename: Path of the JSON file to read.

        Returns:
            The decoded Python object.
        """
        # The with-block closes the handle; no explicit close() is needed.
        with open(filename, 'r') as fh:
            return json.load(fh)
    
    def count_word(path):
        """Count how often each keyword occurs in the JSON file at ``path``.

        The file's 'keyword' entry is read as a list of strings; each string
        is split on ',' and every piece is tallied exactly as it appears.

        Returns:
            A dict mapping each keyword to its number of occurrences.
        """
        tally = {}
        for keyword_str in input_from_json(path)['keyword']:
            for kwd in keyword_str.split(','):
                tally[kwd] = tally.get(kwd, 0) + 1
        return tally
      
     
    def sort_by_count(d):
        """Return the items of ``d`` as an OrderedDict, largest value first."""

        def _descending(pair):
            # Negating the count makes the ascending sort run high-to-low.
            return -pair[1]

        ranked = sorted(d.items(), key=_descending)
        return collections.OrderedDict(ranked)
    
     
    if __name__ == '__main__':
        # Rank the keywords by frequency, keep the first 50 and write:
        #   * a comma-separated run of {"name", "value"} JSON objects, and
        #   * a JSON object holding just the keyword names.
        file_name = "1203_china_kwd.json"
        top_n = 50

        dword = sort_by_count(count_word(file_name))

        top_china_kwd_list = []
        name_value_items = []
        for num, (key, value) in enumerate(dword.items(), 1):
            if num > top_n:
                break  # nothing past the top N is ever written
            key = key.replace("_", " ")
            top_china_kwd_list.append(key)
            name_value_items.append(json.dumps({'name': key, 'value': value}))

        # NOTE: the file name says "top15" but it receives the top 50 entries.
        # Joining avoids a trailing comma when fewer than 50 keywords exist,
        # and the with-blocks guarantee both files are flushed and closed.
        with open('1203_top15_china_kwd.json', 'w') as fobj2:
            fobj2.write(','.join(name_value_items))

        with open('1204_top50_china_kwd_list.json', 'w') as fobj1:
            fobj1.write(json.dumps({'china_kwd': top_china_kwd_list}))

    2.获取美国的所有关键词,并做统计,与中国的统计代码相似,下一步工作是整合代码。

    import pymysql
    import json
    
    # Fetch the keyword string and PMC id of every `alzheimer` row whose
    # `authorinfor` column contains 'USA' and whose keyword string is
    # non-empty.
    conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
    )
    try:
        cursor = conn.cursor()

        sql = "SELECT union_kwd_str,pmc_id FROM alzheimer where authorinfor like '%USA%' && union_kwd_str != ''"
        # The value returned by execute() is used as the matched-row count.
        a = cursor.execute(sql)
        print(a)  # single-argument call form works on Python 2 and 3
        b = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # Column 0 is the keyword string; column 1 is the PMC id, stored under
    # the row's position in the result set.
    abstract_list = [row[0] for row in b]
    pmc_id_dict = {j: row[1] for j, row in enumerate(b)}
    
    
    
    def output_to_json(data, filename):
        """Write ``data`` to ``filename`` as JSON and return the JSON text.

        Serialization happens once, before the file is opened, so a value
        that cannot be encoded does not leave a truncated file behind.

        Args:
            data: Any JSON-serializable object.
            filename: Path of the file to create or overwrite.

        Returns:
            The JSON string that was written to the file.
        """
        text = json.dumps(data)
        # The with-block closes the handle; no explicit close() is needed.
        with open(filename, 'w') as fh:
            fh.write(text)
        return text
    
    # Bundle the country label, the row count and the keyword strings,
    # then persist them for the ranking step.
    output_data = {}
    output_data['country'] = "USA"
    output_data['count'] = a
    output_data['keyword'] = abstract_list
    output_to_json(output_data, '1204_USA_kwd.json')

    美国前50的关键词

    import re  
    import collections  
    import json
    
    def input_from_json(filename):
        """Read ``filename`` and return the decoded JSON document.

        Args:
            filename: Path of the JSON file to read.

        Returns:
            The decoded Python object.
        """
        # The with-block closes the handle; no explicit close() is needed.
        with open(filename, 'r') as fh:
            return json.load(fh)
    
    def count_word(path):
        """Tally keyword occurrences from the JSON file at ``path``.

        The file's 'keyword' entry is read as a list of strings; each string
        is split on ',' and every piece is counted exactly as it appears.

        Returns:
            A dict mapping each keyword to its number of occurrences.
        """
        tally = {}
        for keyword_str in input_from_json(path)['keyword']:
            for kwd in keyword_str.split(','):
                tally[kwd] = tally.get(kwd, 0) + 1
        return tally
      
     
    def sort_by_count(d):
        """Return an OrderedDict of ``d``'s items ordered by value, descending."""

        def _descending(pair):
            # Negating the count makes the ascending sort run high-to-low.
            return -pair[1]

        ranked = sorted(d.items(), key=_descending)
        return collections.OrderedDict(ranked)
    
     
    if __name__ == '__main__':
        # Rank the keywords by frequency, keep the first 50 and write:
        #   * a comma-separated run of {"name", "value"} JSON objects, and
        #   * a JSON object holding just the keyword names.
        file_name = "1204_USA_kwd.json"
        top_n = 50

        dword = sort_by_count(count_word(file_name))

        top_USA_kwd_list = []
        name_value_items = []
        for num, (key, value) in enumerate(dword.items(), 1):
            if num > top_n:
                break  # nothing past the top N is ever written
            key = key.replace("_", " ")
            top_USA_kwd_list.append(key)
            name_value_items.append(json.dumps({'name': key, 'value': value}))

        # Joining avoids a trailing comma when fewer than 50 keywords exist,
        # and the with-blocks guarantee both files are flushed and closed.
        with open('1204_top50_USA_kwd.json', 'w') as fobj2:
            fobj2.write(','.join(name_value_items))

        with open('1204_top50_USA_kwd_list.json', 'w') as fobj1:
            fobj1.write(json.dumps({'USA_kwd': top_USA_kwd_list}))

    3. 获取世界的所有关键词,用于统计世界前50的关键词

    import pymysql
    import json
    
    # Fetch the keyword string and PMC id of every `alzheimer` row whose
    # keyword string is non-empty (no filter on the author column).
    conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
    )
    try:
        cursor = conn.cursor()

        sql = "SELECT union_kwd_str,pmc_id FROM alzheimer where  union_kwd_str != ''"
        # The value returned by execute() is used as the matched-row count.
        a = cursor.execute(sql)
        print(a)  # single-argument call form works on Python 2 and 3
        b = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # Column 0 is the keyword string; column 1 is the PMC id, stored under
    # the row's position in the result set.
    abstract_list = [row[0] for row in b]
    pmc_id_dict = {j: row[1] for j, row in enumerate(b)}
    
    
    
    def output_to_json(data, filename):
        """Dump ``data`` as JSON into ``filename`` and return the JSON text.

        Serialization happens once, before the file is opened, so a value
        that cannot be encoded does not leave a truncated file behind.

        Args:
            data: Any JSON-serializable object.
            filename: Path of the file to create or overwrite.

        Returns:
            The JSON string that was written to the file.
        """
        text = json.dumps(data)
        # The with-block closes the handle; no explicit close() is needed.
        with open(filename, 'w') as fh:
            fh.write(text)
        return text
    
    # Bundle the scope label, the row count and the keyword strings,
    # then persist them for the ranking step.
    output_data = {}
    output_data['country'] = "world"
    output_data['count'] = a
    output_data['keyword'] = abstract_list
    output_to_json(output_data, '1203_world_kwd.json')

    世界前50关键词

    import re  
    import collections  
    import json
    
    def input_from_json(filename):
        """Open ``filename`` and return its contents decoded from JSON.

        Args:
            filename: Path of the JSON file to read.

        Returns:
            The decoded Python object.
        """
        # The with-block closes the handle; no explicit close() is needed.
        with open(filename, 'r') as fh:
            return json.load(fh)
    
    def count_word(path):
        """Build a keyword-frequency table from the JSON file at ``path``.

        The file's 'keyword' entry is read as a list of strings; each string
        is split on ',' and every piece is counted exactly as it appears.

        Returns:
            A dict mapping each keyword to its number of occurrences.
        """
        tally = {}
        for keyword_str in input_from_json(path)['keyword']:
            for kwd in keyword_str.split(','):
                tally[kwd] = tally.get(kwd, 0) + 1
        return tally
      
     
    def sort_by_count(d):
        """Order the items of ``d`` by value, highest first, as an OrderedDict."""

        def _descending(pair):
            # Negating the count makes the ascending sort run high-to-low.
            return -pair[1]

        ranked = sorted(d.items(), key=_descending)
        return collections.OrderedDict(ranked)
    
     
    if __name__ == '__main__':
        # Rank the keywords by frequency, keep the first 50 and write:
        #   * a comma-separated run of {"name", "value"} JSON objects, and
        #   * a JSON object holding just the keyword names.
        file_name = "1203_world_kwd.json"
        top_n = 50

        dword = sort_by_count(count_word(file_name))

        top_world_kwd_list = []
        name_value_items = []
        for num, (key, value) in enumerate(dword.items(), 1):
            if num > top_n:
                break  # nothing past the top N is ever written
            key = key.replace("_", " ")
            top_world_kwd_list.append(key)
            name_value_items.append(json.dumps({'name': key, 'value': value}))

        # NOTE: the file name says "top15" but it receives the top 50 entries.
        # Joining avoids a trailing comma when fewer than 50 keywords exist,
        # and the with-blocks guarantee both files are flushed and closed.
        with open('1203_top15_world_kwd.json', 'w') as fobj2:
            fobj2.write(','.join(name_value_items))

        with open('1204_top50_world_kwd_list.json', 'w') as fobj1:
            fobj1.write(json.dumps({'world_kwd': top_world_kwd_list}))

    4.比较中国与美国的关键词有哪些相似的,以及中国与世界的研究热点有哪些相似的

    import json
    
    
    def input_from_json(filename):
        """Return the JSON document stored in ``filename`` as a Python object.

        Args:
            filename: Path of the JSON file to read.

        Returns:
            The decoded Python object.
        """
        # The with-block closes the handle; no explicit close() is needed.
        with open(filename, 'r') as fh:
            return json.load(fh)
    
    # Load the three top-50 keyword lists and report which keywords China
    # shares with the world list and with the USA list.
    china_path = '1204_top50_china_kwd_list.json'
    world_path = '1204_top50_world_kwd_list.json'
    USA_path = '1204_top50_USA_kwd_list.json'
    china_kwd_list = input_from_json(china_path)['china_kwd']
    world_kwd_list = input_from_json(world_path)['world_kwd']
    USA_kwd_list = input_from_json(USA_path)['USA_kwd']

    a = set(china_kwd_list)
    b = set(world_kwd_list)
    c = set(USA_kwd_list)

    # Keywords present in both the China and the world list.
    china_world_same_kwd = list(a & b)
    for kwd in china_world_same_kwd:
        # On Python 2 the decoded value is `unicode`; encode it before
        # printing. On Python 3 it is already `str` and is printed as-is.
        if not isinstance(kwd, str):
            kwd = kwd.encode('utf-8')
        print(kwd)

    print(len(china_world_same_kwd))

    # Separator between the two reports (the literal was split across two
    # lines in the original listing, which is a syntax error).
    print('\n')

    # Keywords present in both the China and the USA list.
    china_USA_same_kwd = list(a & c)
    for kwd in china_USA_same_kwd:
        if not isinstance(kwd, str):
            kwd = kwd.encode('utf-8')
        print(kwd)

    # Report the China/USA overlap size; the original printed the
    # China/world count a second time here.
    print(len(china_USA_same_kwd))
  • 相关阅读:
    翻转单词顺序序列
    左旋转字符串
    查找第一个只出现一次的字符
    C语言函数与程序结构
    C语言实现快速排序法(分治法)
    C语言binsearch,shellsort,insertsort
    c语言的类型、运算符与表达式
    进程和线程
    CMS和G1
    Python IO编程
  • 原文地址:https://www.cnblogs.com/lovely7/p/6178853.html
Copyright © 2011-2022 走看看