"""Item-based collaborative filtering demo built on Jaccard similarity.

A small user x item purchase matrix is used to:

1. compute the Jaccard similarity between two single items,
2. compute the full user-user and item-item similarity matrices,
3. pick the top-N most similar items for every item, and
4. recommend to each user the items similar to what they already bought,
   minus the items they already own.
"""
import pprint

import numpy as np
import pandas as pd

users = ['user1', 'user2', 'user3', 'user4', 'user5']
items = ['item A', 'item B', 'item C', 'item D', 'item E']

# Purchase matrix: one row per user, one column per item.
# A non-zero entry means the user has purchased the item.
datasets = [
    [1, 0, 1, 1, 0],
    [1, 0, 0, 1, 1],
    [1, 0, 1, 0, 0],
    [0, 1, 0, 1, 1],
    [1, 1, 1, 0, 1],
]
df = pd.DataFrame(datasets, columns=items, index=users)


def jaccard_similarity(frame):
    """Return the pairwise Jaccard similarity between the rows of *frame*.

    Args:
        frame: DataFrame of 0/1 flags; each row is one vector to compare.

    Returns:
        A square DataFrame indexed (rows and columns) by ``frame.index``
        holding ``1 - jaccard_distance`` for every pair of rows.
    """
    # Imported lazily so the pandas-only helpers in this module stay
    # importable (and testable) when scikit-learn is not installed.
    from sklearn.metrics.pairwise import pairwise_distances

    # Jaccard is a boolean metric; casting up front avoids the
    # DataConversionWarning scikit-learn emits for integer input.
    flags = frame.to_numpy().astype(bool)
    distances = pairwise_distances(flags, metric='jaccard')
    return pd.DataFrame(1 - distances, index=frame.index, columns=frame.index)


def top_n_similar_items(similarity, n=2):
    """Return the *n* most similar items for every item.

    Args:
        similarity: square item x item similarity DataFrame.
        n: how many neighbours to keep per item.

    Returns:
        Dict mapping each item label to a list of its *n* most similar
        items, best first. The item itself is never included.

    Ties are broken in favour of the item that appears later in the index.
    The default ``sort_values`` algorithm gives no ordering guarantee for
    equal scores, so the tie-break is made explicit here; this particular
    choice reproduces the sample output recorded at the end of this file.
    """
    top_items = {}
    for item in similarity.index:
        scores = similarity.loc[item].drop([item])
        # Stable ascending sort keeps equal scores in index order; walking the
        # result backwards yields descending scores with later items first.
        # NaNs are placed first so that they end up last after the reversal.
        ranked = scores.sort_values(kind='mergesort', na_position='first')
        top_items[item] = list(ranked.index[::-1][:n])
    return top_items


def purchased_items(row):
    """Return the labels of the non-zero, non-missing entries of *row*."""
    return row[row.notna() & (row != 0)].index


def recommend(purchases, top_items):
    """Build the recommendation set for every user.

    Args:
        purchases: user x item DataFrame of purchase flags.
        top_items: mapping item -> list of similar items, as returned by
            :func:`top_n_similar_items`.

    Returns:
        Dict mapping each user to the set of recommended items.
    """
    results = {}
    for user in purchases.index:
        # Items the user has already purchased (computed once per user).
        owned = set(purchased_items(purchases.loc[user]))
        # For each purchased item, add its most similar items to build the
        # initial recommendation set.
        candidates = set()
        for item in owned:
            candidates.update(top_items[item])
        # Filter out the items the user has already purchased.
        results[user] = candidates - owned
    return results


def main():
    """Run the demo and print every intermediate result."""
    # Only ``jaccard_score`` is imported: ``jaccard_similarity_score`` has
    # been removed from scikit-learn, so importing it raises ImportError.
    from sklearn.metrics import jaccard_score

    print(df)

    sim = jaccard_score(df['item A'], df['item B'])
    print(sim)

    user_similar = jaccard_similarity(df)
    print(user_similar)

    items_similar = jaccard_similarity(df.T)
    print(items_similar)

    # Show each item's similarity to all the other items.
    for item in items_similar.index:
        print(items_similar.loc[item].drop([item]))

    topN_items = top_n_similar_items(items_similar)
    print('Top 2 相似物品:')
    print(topN_items)

    rs_results = recommend(df, topN_items)
    print('最终推荐结果:')
    print(rs_results)


if __name__ == '__main__':
    main()

# Sample output of the final ``print`` (the order of elements inside each set
# may differ between runs):
# {'user1': {'item B', 'item E'}, 'user2': {'item C', 'item B'},
#  'user3': {'item B', 'item E'}, 'user4': {'item A'}, 'user5': {'item D'}}