zoukankan      html  css  js  c++  java
  • simrank python实现

    1、数据

    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    View Code

    2、simrank 的python实现

    import numpy as np 
    from numpy import matrix
    
    with open('sample1 (1).txt','r') as log_fp:
        logs = [log.strip() for log in log_fp.readlines()]
        # print(logs)
    logs_tuple = [tuple(log.split(",")) for log in logs]
    # print (logs_tuple)
    
    queries = list(set([log[0] for log in logs_tuple]))
    # print(queries)    #['digital camera', 'flower', 'pc', 'camera', 'tv']
    ads = list(set([log[1] for log in logs_tuple]))
    # print(ads)#['hp.com', 'teleflora.com', 'bestbuy.com', 'orchids.com']
    
    graph = np.matrix(np.zeros([len(queries),len(ads)]))
    # print(graph)   #6行4列的0矩阵
    
    for log in logs_tuple:
        query = log[0]
        ad = log[1]
        q_i = queries.index(query)
        a_j = ads.index(ad)
        graph[q_i,a_j] +=1
    print(graph)
    
    query_sim = matrix(np.identity(len(queries)))
    print(query_sim)
    ad_sim = matrix(np.identity(len(ads)))
    print(ad_sim)
    
    def get_ads_num(query):
        q_i = queries.index(query)
        return graph[q_i]
    
    def get_queries_num(ad):
        a_j = ads.index(ad)
        return graph.transpose()[a_j]
    
    def get_ads(query):
        series = get_ads_num(query).tolist()[0]
        return [ads[x] for x in range(len(series)) if series[x] > 0]
    
    def get_queries(ad):
        series = get_queries_num(ad).tolist()[0]
        return [queries[x] for x in range(len(series)) if series[x] > 0]
    
    
    def query_simrank(q1,q2,c):
        if q1 == q2 :
            return 1
        prefix = c/(get_ads_num(q1).sum() *get_ads_num(q2).sum())
        postfix = 0
        for ad_i in get_ads(q1):
            for ad_j in get_ads(q2):
                i = ads.index(ad_i)
                j = ads.index(ad_j)
                postfix += ad_sim[i,j]
        return prefix*postfix
    
    
    def ad_simrank(a1,a2,c):
        if a1 == a2 :
            return 1
        prefix = c/(get_queries_num(a1).sum()*get_queries_num(a2).sum())
        postfix = 0
        for query_i in get_queries(a1):
            for query_j in get_queries(a2):
                i = queries.index(query_i)
                j = queries.index(query_j)
                postfix += query_sim[i,j]
        return prefix*postfix
    
    
    def simrank(c=0.8,times = 1):
        global query_sim,ad_sim
    
        for run in range(times):
            new_query_sim = matrix(np.identity(len(queries)))
            for qi in queries:
                for qj in queries:
                    i = queries.index(qi)
                    j = queries.index(qj)
                    new_query_sim[i,j] =query_simrank(qi,qj,c)
    
            new_ad_sim = matrix(np.identity(len(ads)))
            for ai in ads:
                for aj in ads :
                    i = ads.index(ai)
                    j = ads.index(aj)
                    new_ad_sim[i,j] =ad_simrank(ai,aj,c)
    
            query_sim = new_query_sim
            ad_sim = new_ad_sim
    
    
    if __name__ == '__main__':
        print (queries)
        print(ads)
        simrank()
        print(query_sim)
        print(ad_sim)

    [[15.  0.  0.  0.]
     [ 0.  0. 10.  0.]
     [ 5.  0. 20.  0.]
     [ 7.  0. 30.  0.]
     [ 0. 16.  0. 15.]]
    [[
    1. 0. 0. 0. 0.] [0. 1. 0. 0. 0.] [0. 0. 1. 0. 0.] [0. 0. 0. 1. 0.] [0. 0. 0. 0. 1.]]
    [[
    1. 0. 0. 0.] [0. 1. 0. 0.] [0. 0. 1. 0.] [0. 0. 0. 1.]]
    [
    'tv', 'pc', 'camera', 'digital camera', 'flower']
    [
    'bestbuy.com', 'teleflora.com', 'hp.com', 'orchids.com']
    [[
    1. 0. 0.00213333 0.00144144 0. ] [0. 1. 0.0032 0.00216216 0. ] [0.00213333 0.0032 1. 0.00172973 0. ] [0.00144144 0.00216216 0.00172973 1. 0. ] [0. 0. 0. 0. 1. ]]
    [[
    1.00000000e+00 0.00000000e+00 9.87654321e-04 0.00000000e+00] [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.33333333e-03] [9.87654321e-04 0.00000000e+00 1.00000000e+00 0.00000000e+00] [0.00000000e+00 3.33333333e-03 0.00000000e+00 1.00000000e+00]]
  • 相关阅读:
    Ubuntu下配置Viw Tab键缩进格数
    Windows文件传给Ubuntu出现乱码问题
    [Luogu] P4948 数列求和
    [Luogu]P1286 两数之和
    [Luogu]P2717 寒假作业
    [Luogu]P5110 块速递推
    Breeze魔兽编程交流论坛
    #pragma 预处理指令详解
    Api Hook 细析(一)
    CMS(网站内容管理系统)有哪些?
  • 原文地址:https://www.cnblogs.com/spp666/p/11821700.html
Copyright © 2011-2022 走看看