zoukankan      html  css  js  c++  java
  • simrank python实现

    1、数据

    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    View Code

    2、simrank 的python实现

    import numpy as np 
    from numpy import matrix
    
    # Load the click log: one "query,ad" record per line.
    with open('sample1 (1).txt','r') as log_fp:
        # Blank lines (for example a trailing newline at the end of the file)
        # are dropped; otherwise they would become one-element tuples and the
        # log[1] lookups below would raise IndexError.
        logs = [log for log in (line.strip() for line in log_fp) if log]
    logs_tuple = [tuple(log.split(",")) for log in logs]
    
    # Distinct node labels. They are sorted because plain set iteration order
    # for strings can differ between interpreter runs, which would shuffle the
    # rows/columns of every matrix printed below.
    queries = sorted(set(log[0] for log in logs_tuple))
    ads = sorted(set(log[1] for log in logs_tuple))
    
    # Click-count matrix with len(queries) rows and len(ads) columns:
    # graph[i, j] is the number of log records pairing queries[i] with ads[j].
    graph = np.matrix(np.zeros([len(queries),len(ads)]))
    
    # Label -> index maps so each record costs one dict lookup instead of a
    # linear list.index scan.
    _query_pos = {query: i for i, query in enumerate(queries)}
    _ad_pos = {ad: j for j, ad in enumerate(ads)}
    for log in logs_tuple:
        graph[_query_pos[log[0]], _ad_pos[log[1]]] += 1
    print(graph)
    
    # Similarity matrices start as identity: every node is only similar to
    # itself before simrank() has been run.
    query_sim = matrix(np.identity(len(queries)))
    print(query_sim)
    ad_sim = matrix(np.identity(len(ads)))
    print(ad_sim)
    
    def get_ads_num(query):
        """Return the row of ``graph`` that belongs to ``query``.

        The row holds one click count per entry of ``ads``. ``list.index``
        raises ValueError when ``query`` is not in ``queries``.
        """
        return graph[queries.index(query)]
    
    def get_queries_num(ad):
        """Return the column of ``graph`` that belongs to ``ad``, as a row.

        The result holds one click count per entry of ``queries``.
        ``list.index`` raises ValueError when ``ad`` is not in ``ads``.
        """
        column = ads.index(ad)
        # Transposing first turns the wanted column into an indexable row.
        return graph.T[column]
    
    def get_ads(query):
        """Return the ads with a positive count in ``query``'s row of ``graph``.

        Ads are returned in the same order as they appear in ``ads``.
        """
        # The row comes back as a 1 x len(ads) matrix; flatten it to a list.
        counts = get_ads_num(query).tolist()[0]
        return [ad for ad, count in zip(ads, counts) if count > 0]
    
    def get_queries(ad):
        """Return the queries with a positive count in ``ad``'s column of ``graph``.

        Queries are returned in the same order as they appear in ``queries``.
        """
        # The column comes back as a 1 x len(queries) matrix; flatten it.
        counts = get_queries_num(ad).tolist()[0]
        return [query for query, count in zip(queries, counts) if count > 0]
    
    
    def query_simrank(q1,q2,c):
        """Compute one SimRank update for the query pair (``q1``, ``q2``).

        Returns 1 when both queries are the same. Otherwise returns the decay
        factor ``c`` divided by the product of the two queries' total click
        counts, multiplied by the sum of the current ``ad_sim`` scores over
        every (ad of q1, ad of q2) pair.
        """
        if q1 == q2 :
            return 1
        # NOTE(review): the denominator uses total click counts while the sum
        # below runs over distinct neighbouring ads. Standard SimRank divides
        # by the number of neighbours instead - confirm this is intended.
        scale = c/(get_ads_num(q1).sum() *get_ads_num(q2).sum())
        other_cols = [ads.index(ad) for ad in get_ads(q2)]
        total = 0
        for ad_i in get_ads(q1):
            row = ads.index(ad_i)
            for col in other_cols:
                total += ad_sim[row,col]
        return scale*total
    
    
    def ad_simrank(a1,a2,c):
        """Compute one SimRank update for the ad pair (``a1``, ``a2``).

        Returns 1 when both ads are the same. Otherwise returns the decay
        factor ``c`` divided by the product of the two ads' total click
        counts, multiplied by the sum of the current ``query_sim`` scores
        over every (query of a1, query of a2) pair.
        """
        if a1 == a2 :
            return 1
        # NOTE(review): the denominator uses total click counts while the sum
        # below runs over distinct neighbouring queries. Standard SimRank
        # divides by the number of neighbours instead - confirm this is
        # intended.
        scale = c/(get_queries_num(a1).sum()*get_queries_num(a2).sum())
        other_cols = [queries.index(query) for query in get_queries(a2)]
        total = 0
        for query_i in get_queries(a1):
            row = queries.index(query_i)
            for col in other_cols:
                total += query_sim[row,col]
        return scale*total
    
    
    def simrank(c=0.8,times = 1):
        """Run ``times`` update rounds on the module-level similarity matrices.

        Each round computes a fresh query matrix and a fresh ad matrix from
        the matrices of the previous round, then rebinds the globals
        ``query_sim`` and ``ad_sim`` to the new ones. ``c`` is the decay
        factor handed to ``query_simrank`` / ``ad_simrank``.
        """
        global query_sim,ad_sim
    
        for _ in range(times):
            # Labels are unique, so the enumerate position equals list.index.
            next_query_sim = matrix(np.identity(len(queries)))
            for i, qi in enumerate(queries):
                for j, qj in enumerate(queries):
                    next_query_sim[i,j] = query_simrank(qi,qj,c)
    
            next_ad_sim = matrix(np.identity(len(ads)))
            for i, ai in enumerate(ads):
                for j, aj in enumerate(ads):
                    next_ad_sim[i,j] = ad_simrank(ai,aj,c)
    
            # Swap both in only after both are computed, so the ad update
            # above still saw the previous round's query similarities.
            query_sim, ad_sim = next_query_sim, next_ad_sim
    
    
    if __name__ == '__main__':
        # Print the node labels first so the matrix rows/columns can be read.
        for labels in (queries, ads):
            print (labels)
        # One round with the default decay factor, then show both results.
        simrank()
        for scores in (query_sim, ad_sim):
            print(scores)

    [[15.  0.  0.  0.]
     [ 0.  0. 10.  0.]
     [ 5.  0. 20.  0.]
     [ 7.  0. 30.  0.]
     [ 0. 16.  0. 15.]]
    [[1. 0. 0. 0. 0.]
     [0. 1. 0. 0. 0.]
     [0. 0. 1. 0. 0.]
     [0. 0. 0. 1. 0.]
     [0. 0. 0. 0. 1.]]
    [[1. 0. 0. 0.]
     [0. 1. 0. 0.]
     [0. 0. 1. 0.]
     [0. 0. 0. 1.]]
    ['tv', 'pc', 'camera', 'digital camera', 'flower']
    ['bestbuy.com', 'teleflora.com', 'hp.com', 'orchids.com']
    [[1.         0.         0.00213333 0.00144144 0.        ]
     [0.         1.         0.0032     0.00216216 0.        ]
     [0.00213333 0.0032     1.         0.00172973 0.        ]
     [0.00144144 0.00216216 0.00172973 1.         0.        ]
     [0.         0.         0.         0.         1.        ]]
    [[1.00000000e+00 0.00000000e+00 9.87654321e-04 0.00000000e+00]
     [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.33333333e-03]
     [9.87654321e-04 0.00000000e+00 1.00000000e+00 0.00000000e+00]
     [0.00000000e+00 3.33333333e-03 0.00000000e+00 1.00000000e+00]]
  • 相关阅读:
    Linux下守护进程的编程实现(转)
    gcc 中-I、 -L 与-l选项的作用
    va_list 、va_start、 va_arg、 va_end 使用说明
    sizeof('a')在C与C++中的区别
    Qt主要类简介
    linux命令
    linux文件编程
    linux一些基本命令
    Linux主机SSH免密设置解析
    javaweb添加拦截器
  • 原文地址:https://www.cnblogs.com/spp666/p/11821700.html
Copyright © 2011-2022 走看看