zoukankan      html  css  js  c++  java
  • 相似度与距离计算python代码实现

     1 #定义几种距离计算函数
     2 #更高效的方式为把得分向量化之后使用scipy中定义的distance方法
     3 
     4 from math import sqrt
     5 def euclidean_dis(rating1, rating2):    #欧式距离计算
     6     """计算2个打分序列间的欧式距离. 输入的rating1和rating2都是打分dict
     7        格式为{'小时代4': 1.0, '疯狂动物城': 5.0}"""
     8     distance = 0
     9     commonRatings = False 
    10     for key in rating1:
    11         if key in rating2:
    12             distance += (rating1[key] - rating2[key])^2
    13             commonRatings = True
    14     #两个打分序列之间有公共打分电影
    15     if commonRatings:
    16         return distance
    17     #无公共打分电影
    18     else:
    19         return -1
    20 
    21 
    22 def manhattan_dis(rating1, rating2):    #曼哈顿距离计算
    23     """计算2个打分序列间的曼哈顿距离. 输入的rating1和rating2都是打分dict
    24        格式为{'小时代4': 1.0, '疯狂动物城': 5.0}"""
    25     distance = 0
    26     commonRatings = False 
    27     for key in rating1:
    28         if key in rating2:
    29             distance += abs(rating1[key] - rating2[key])
    30             commonRatings = True
    31     #两个打分序列之间有公共打分电影
    32     if commonRatings:
    33         return distance
    34     #无公共打分电影
    35     else:
    36         return -1
    37 
    38 def cos_dis(rating1, rating2):   #余弦相似度计算
    39     """计算2个打分序列间的cos距离. 输入的rating1和rating2都是打分dict
    40        格式为{'小时代4': 1.0, '疯狂动物城': 5.0}"""
    41     distance = 0
    42     dot_product_1 = 0
    43     dot_product_2 = 0
    44     commonRatings = False
    45     
    46     for score in rating1.values():
    47         dot_product_1 += score^2
    48     for score in rating2.values():
    49         dot_product_2 += score^2
    50         
    51     for key in rating1:
    52         if key in rating2:
    53             distance += rating1[key] * rating2[key]
    54             commonRatings = True
    55     #两个打分序列之间有公共打分电影
    56     if commonRatings:
    57         return 1-distance/sqrt(dot_product_1*dot_product_2)
    58     #无公共打分电影
    59     else:
    60         return -1
    61 
    62 def pearson_dis(rating1, rating2):  #皮尔逊相似度计算
    63     """计算2个打分序列间的pearson距离. 输入的rating1和rating2都是打分dict
    64        格式为{'小时代4': 1.0, '疯狂动物城': 5.0}"""
    65     sum_xy = 0
    66     sum_x = 0
    67     sum_y = 0
    68     sum_x2 = 0
    69     sum_y2 = 0
    70     n = 0
    71     for key in rating1:
    72         if key in rating2:
    73             n += 1
    74             x = rating1[key]
    75             y = rating2[key]
    76             sum_xy += x * y
    77             sum_x += x
    78             sum_y += y
    79             sum_x2 += pow(x, 2)
    80             sum_y2 += pow(y, 2)
    81     # now compute denominator
    82     denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * sqrt(sum_y2 - pow(sum_y, 2) / n)
    83     if denominator == 0:
    84         return 0
    85     else:
    86         return (sum_xy - (sum_x * sum_y) / n) / denominator
  • 相关阅读:
    zabbix 二 zabbix agent 客户端
    zabbix (一:zabbix服务端)
    ssh ip "WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED!"
    undefined reference to libiconv_open'
    linux 远程 windows 命令:rdesktop vs windows mstsc
    源码安装mysql
    svn 结合rsync 的代码发布系统
    svn !
    使用rsync 的 --delete参数删除目标目录比源目录多余的文件
    svn 提交冲突(目录下删除文件)
  • 原文地址:https://www.cnblogs.com/luozeng/p/8635444.html
Copyright © 2011-2022 走看看