zoukankan      html  css  js  c++  java
  • simrank python实现

    1、数据

    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    View Code

    2、simrank 的python实现

    import numpy as np 
    from numpy import matrix
    
    # Load the click log: each non-empty line is one "query,ad" click record.
    with open('sample1 (1).txt','r') as log_fp:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # a blank line would otherwise become a 1-tuple and raise IndexError
        # when its second field is read below.
        logs = [log for log in (raw.strip() for raw in log_fp) if log]
    logs_tuple = [tuple(log.split(",")) for log in logs]

    # Distinct queries and ads. They are sorted so that the row/column order
    # of every matrix below is identical on every run; the iteration order of
    # a set of strings can change between interpreter runs.
    queries = sorted({log[0] for log in logs_tuple})
    ads = sorted({log[1] for log in logs_tuple})

    # Name -> position lookups, so filling the graph does not rescan the
    # label lists once per log record.
    _query_pos = {query: q_i for q_i, query in enumerate(queries)}
    _ad_pos = {ad: a_j for a_j, ad in enumerate(ads)}

    # Click-count matrix of shape (len(queries), len(ads)):
    # graph[q_i, a_j] is the number of log records pairing queries[q_i]
    # with ads[a_j].
    graph = np.matrix(np.zeros([len(queries),len(ads)]))
    for log in logs_tuple:
        graph[_query_pos[log[0]], _ad_pos[log[1]]] += 1
    print(graph)

    # Similarity matrices start as identity: every query (ad) is fully
    # similar to itself and has similarity 0 to every other query (ad).
    query_sim = matrix(np.identity(len(queries)))
    print(query_sim)
    ad_sim = matrix(np.identity(len(ads)))
    print(ad_sim)
    
    def get_ads_num(query):
        """Return the row of ``graph`` that belongs to ``query``.

        The result is a 1 x len(ads) matrix holding the click count of
        ``query`` on each ad. ``list.index`` raises ValueError when ``query``
        is not in ``queries``.
        """
        return graph[queries.index(query)]
    
    def get_queries_num(ad):
        """Return the column of ``graph`` that belongs to ``ad``, as a row.

        The result is a 1 x len(queries) matrix holding the click count of
        each query on ``ad``. ``list.index`` raises ValueError when ``ad`` is
        not in ``ads``.
        """
        # Rows of the transposed graph are the columns of the original.
        return graph.T[ads.index(ad)]
    
    def get_ads(query):
        """Return the names of all ads with a positive click count for ``query``.

        Names are returned in the order in which they appear in ``ads``.
        """
        # The graph row has one entry per ad, in the same order as ``ads``.
        click_counts = get_ads_num(query).tolist()[0]
        return [ad for ad, clicks in zip(ads, click_counts) if clicks > 0]
    
    def get_queries(ad):
        """Return the names of all queries with a positive click count on ``ad``.

        Names are returned in the order in which they appear in ``queries``.
        """
        # The graph column has one entry per query, in the same order as
        # ``queries``.
        click_counts = get_queries_num(ad).tolist()[0]
        return [query for query, clicks in zip(queries, click_counts) if clicks > 0]
    
    
    def query_simrank(q1,q2,c):
        """Compute one SimRank update for the query pair (``q1``, ``q2``).

        Returns 1 when both queries are the same. Otherwise returns
        ``c / (clicks(q1) * clicks(q2))`` multiplied by the sum of the current
        ``ad_sim`` entries over every pair (ad clicked for ``q1``, ad clicked
        for ``q2``), where ``clicks(q)`` is the sum of the graph row of ``q``.

        NOTE(review): the normaliser is the total click count of each query,
        while the double sum runs over distinct ads without click weights.
        The textbook SimRank formula divides by the number of neighbours
        instead -- confirm which normalisation is intended.
        """
        if q1 == q2:
            return 1
        ads_of_q1 = get_ads(q1)
        ads_of_q2 = get_ads(q2)
        total = 0
        for left_ad in ads_of_q1:
            i = ads.index(left_ad)
            for right_ad in ads_of_q2:
                total += ad_sim[i, ads.index(right_ad)]
        scale = c/(get_ads_num(q1).sum() *get_ads_num(q2).sum())
        return scale*total
    
    
    def ad_simrank(a1,a2,c):
        """Compute one SimRank update for the ad pair (``a1``, ``a2``).

        Returns 1 when both ads are the same. Otherwise returns
        ``c / (clicks(a1) * clicks(a2))`` multiplied by the sum of the current
        ``query_sim`` entries over every pair (query that clicked ``a1``,
        query that clicked ``a2``), where ``clicks(a)`` is the sum of the
        graph column of ``a``.

        NOTE(review): the normaliser is the total click count of each ad,
        while the double sum runs over distinct queries without click
        weights. The textbook SimRank formula divides by the number of
        neighbours instead -- confirm which normalisation is intended.
        """
        if a1 == a2:
            return 1
        queries_of_a1 = get_queries(a1)
        queries_of_a2 = get_queries(a2)
        total = 0
        for left_query in queries_of_a1:
            i = queries.index(left_query)
            for right_query in queries_of_a2:
                total += query_sim[i, queries.index(right_query)]
        scale = c/(get_queries_num(a1).sum()*get_queries_num(a2).sum())
        return scale*total
    
    
    def simrank(c=0.8,times = 1):
        """Run ``times`` SimRank iterations, updating the global matrices.

        In each iteration both new matrices are computed from the matrices of
        the previous iteration; only afterwards are the globals ``query_sim``
        and ``ad_sim`` rebound to the new results.

        c     -- decay factor passed to ``query_simrank`` / ``ad_simrank``.
        times -- number of iterations to run.
        """
        global query_sim,ad_sim

        for _ in range(times):
            # ``queries`` and ``ads`` hold distinct labels, so the enumerate
            # position of a label equals its ``list.index`` position.
            next_query_sim = matrix(np.identity(len(queries)))
            for i, qi in enumerate(queries):
                for j, qj in enumerate(queries):
                    next_query_sim[i,j] = query_simrank(qi,qj,c)

            next_ad_sim = matrix(np.identity(len(ads)))
            for i, ai in enumerate(ads):
                for j, aj in enumerate(ads):
                    next_ad_sim[i,j] = ad_simrank(ai,aj,c)

            # Swap both in together so neither update saw the other's new
            # values during this iteration.
            query_sim, ad_sim = next_query_sim, next_ad_sim
    
    
    if __name__ == '__main__':
        # Print the label order first so the matrix rows/columns can be read.
        for labels in (queries, ads):
            print(labels)
        # One iteration with the default decay factor.
        simrank()
        # The tuple is built after simrank() because it rebinds both globals.
        for similarity in (query_sim, ad_sim):
            print(similarity)

    [[15.  0.  0.  0.]
     [ 0.  0. 10.  0.]
     [ 5.  0. 20.  0.]
     [ 7.  0. 30.  0.]
     [ 0. 16.  0. 15.]]
    [[1. 0. 0. 0. 0.]
     [0. 1. 0. 0. 0.]
     [0. 0. 1. 0. 0.]
     [0. 0. 0. 1. 0.]
     [0. 0. 0. 0. 1.]]
    [[1. 0. 0. 0.]
     [0. 1. 0. 0.]
     [0. 0. 1. 0.]
     [0. 0. 0. 1.]]
    ['tv', 'pc', 'camera', 'digital camera', 'flower']
    ['bestbuy.com', 'teleflora.com', 'hp.com', 'orchids.com']
    [[1.         0.         0.00213333 0.00144144 0.        ]
     [0.         1.         0.0032     0.00216216 0.        ]
     [0.00213333 0.0032     1.         0.00172973 0.        ]
     [0.00144144 0.00216216 0.00172973 1.         0.        ]
     [0.         0.         0.         0.         1.        ]]
    [[1.00000000e+00 0.00000000e+00 9.87654321e-04 0.00000000e+00]
     [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.33333333e-03]
     [9.87654321e-04 0.00000000e+00 1.00000000e+00 0.00000000e+00]
     [0.00000000e+00 3.33333333e-03 0.00000000e+00 1.00000000e+00]]
  • 相关阅读:
    嵌入式工程师C语言面试常见的0x10个问题
    C语言初学者网站推荐
    strlen和sizeof
    基于Docker搭建GitLab和Maven私服
    linux暴露端口可以被外部访问
    MySQL新增用户及赋予权限
    Docker添加域名解析
    Netstat 网络命令详解
    Mysql索引太长导致同步数据结构失败解决方法
    完美解决Cannot download "https://github.com/sass/node-sass/releases/download/binding.nod的问题
  • 原文地址:https://www.cnblogs.com/spp666/p/11821700.html
Copyright © 2011-2022 走看看