zoukankan      html  css  js  c++  java
  • 转:TopN推荐系统——推荐的实现与推荐效果的评价指标

    转自:用户推荐系统_python 代码-豆瓣
    书籍:项亮的《推荐系统实践》

    import random
    import math
    
    class UserBasedCF:
        """User-based collaborative filtering for top-N recommendation.

        Ratings are kept as nested dicts ``{userid: {itemid: record}}``; all
        three values are the raw strings read from the data files.  Call
        userSimilarityBest() before recommend() or any of the metric methods.
        """

        def __init__(self,train = None,test = None):
            """Store the two data-file paths and load both files right away.

            train -- path of the training ratings file
            test  -- path of the test ratings file
            """
            self.trainfile = train
            self.testfile = test
            self.readData()

        @staticmethod
        def _loadRatings(path):
            """Parse one ratings file into ``{userid: {itemid: record}}``.

            Every non-blank line must hold four whitespace-separated fields:
            userid, itemid, record and a fourth field that is ignored.
            Raises ValueError if a non-blank line has another field count.
            """
            ratings = {}
            # The context manager closes the file even if parsing fails.
            with open(path) as handle:
                for line in handle:
                    fields = line.split()
                    if not fields:
                        continue  # tolerate blank lines (e.g. a trailing newline)
                    userid, itemid, record, _ = fields
                    ratings.setdefault(userid, {})[itemid] = record
            return ratings

        def readData(self,train = None,test = None):
            """(Re)load the training and test sets into traindata / testdata.

            train, test -- optional new file paths; when omitted the paths
                           stored on the instance are reused.
            """
            self.trainfile = train or self.trainfile
            self.testfile = test or self.testfile
            self.traindata = {}
            self.testdata = {}
            self.traindata = self._loadRatings(self.trainfile)
            self.testdata = self._loadRatings(self.testfile)

        def userSimilarityBest(self,train = None):
            """Compute user-user cosine similarity into self.userSimBest.

            The similarity of u and v is the number of items both interacted
            with divided by sqrt(|items of u| * |items of v|).  Only pairs
            sharing at least one item get an entry, so a user with no such
            neighbour is absent from self.userSimBest.

            train -- ratings dict to use; defaults to self.traindata.
            """
            train = train or self.traindata
            self.userSimBest = dict()
            # Inverted index item -> users, so only co-rating pairs are visited.
            item_users = dict()
            for u, items in train.items():
                for i in items:
                    item_users.setdefault(i, set()).add(u)
            # count[u][v] = number of items shared by u and v.
            count = dict()
            for users in item_users.values():
                for u in users:
                    for v in users:
                        if u == v:
                            continue
                        row = count.setdefault(u, {})
                        row[v] = row.get(v, 0) + 1
            for u, related_users in count.items():
                nu = len(train[u])
                self.userSimBest[u] = {
                    v: cuv / math.sqrt(nu * len(train[v]) * 1.0)
                    for v, cuv in related_users.items()
                }

        def recommend(self,user,train = None,k = 8,nitem = 40):
            """Return up to nitem recommended items for user as {itemid: score}.

            user  -- user id to recommend for
            train -- ratings dict to use; defaults to self.traindata
            k     -- number of most similar users consulted
            nitem -- maximum number of items returned

            A user with no similar users (or an unknown user) gets an empty
            dict instead of raising KeyError.
            """
            train = train or self.traindata
            interacted_items = train.get(user, {})
            # The k users most similar to `user`.
            neighbours = sorted(self.userSimBest.get(user, {}).items(),
                                key = lambda x: x[1], reverse = True)[0:k]
            rank = dict()
            for v, wuv in neighbours:
                for i in train.get(v, {}):
                    if i in interacted_items:
                        continue  # only recommend items the user has not rated
                    # Score is the summed similarity; the rating value itself is
                    # not used (the book variant uses rvi * wuv instead).
                    rank[i] = rank.get(i, 0) + wuv
            # Highest score first, truncated to nitem entries.
            return dict(sorted(rank.items(), key = lambda x: x[1], reverse = True)[0:nitem])

        def recallAndPrecision(self,train = None,test = None,k = 8,nitem = 10):
            """Return (recall, precision) of the recommendations on the test set.

            recall    = hits / total number of test items of all training users
            precision = hits / (nitem * number of training users); nitem is
                        added per user even if fewer items were recommended.
            A ratio whose denominator is zero is reported as 0.0.
            """
            train = train or self.traindata
            test = test or self.testdata
            hit = 0
            recall = 0
            precision = 0
            for user in train:
                tu = test.get(user, {})  # users missing from the test set count as empty
                rank = self.recommend(user, train = train, k = k, nitem = nitem)
                hit += sum(1 for item in rank if item in tu)
                recall += len(tu)
                precision += nitem
            recall_value = float(hit) / recall if recall else 0.0
            precision_value = float(hit) / precision if precision else 0.0
            return (recall_value, precision_value)

        def coverage(self,train = None,test = None,k = 8,nitem = 10):
            """Return the share of training items that get recommended to anyone.

            test is accepted for signature compatibility and is not used.
            Returns 0.0 when the training set contains no items.
            """
            train = train or self.traindata
            recommend_items = set()
            all_items = set()
            for user in train:
                all_items.update(train[user])
                recommend_items.update(self.recommend(user, train, k = k, nitem = nitem))
            if not all_items:
                return 0.0
            return len(recommend_items) / (len(all_items) * 1.0)

        def popularity(self,train = None,test = None,k = 8,nitem = 10):
            """Return the mean of log(1 + popularity) over all recommended items.

            An item's popularity is the number of training users who interacted
            with it.  test is accepted for signature compatibility and is not
            used.  Returns 0.0 when nothing is recommended.
            """
            train = train or self.traindata
            item_popularity = dict()
            for items in train.values():
                for item in items:
                    item_popularity[item] = item_popularity.get(item, 0) + 1
            ret = 0
            n = 0
            for user in train:
                rank = self.recommend(user, train, k = k, nitem = nitem)
                for item in rank:
                    ret += math.log(1 + item_popularity[item])
                    n += 1
            if n == 0:
                return 0.0
            return ret / (n * 1.0)
        
            
    def testUserBasedCF():
        """Evaluate UserBasedCF on 'u1.base' / 'u1.test' and print a metrics table.

        One row is printed per neighbourhood size K, showing precision, recall
        and coverage as percentages followed by the average popularity.
        """
        model = UserBasedCF('u1.base', 'u1.test')
        model.userSimilarityBest()
        header = ('K', "precision", 'recall', 'coverage', 'popularity')
        print("%3s%20s%20s%20s%20s" % header)
        for k in (5, 10, 20, 40, 80, 160):
            # recallAndPrecision returns the pair in (recall, precision) order.
            recall, precision = model.recallAndPrecision(k = k)
            row = (
                k,
                precision * 100,
                recall * 100,
                model.coverage(k = k) * 100,
                model.popularity(k = k),
            )
            print("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % row)
            
    # Run the user-based CF evaluation only when this file is executed as a
    # script, not when it is imported.
    if __name__ == "__main__":
        testUserBasedCF()
    
    基于物品(项目)的协同过滤推荐系统(Item-Based CF,IBCF),对应下面的 KNN 类:
    
    '''
    Created on 2013-10-10
    
    @author: Administrator
    '''
    import random
    import math
    
    class KNN:
        """Item-based collaborative filtering for top-N recommendation.

        Ratings are kept as nested dicts ``{userid: {itemid: record}}``; all
        three values are the raw strings read from the data files.  Call
        ItemSim() before recommend() or any of the metric methods.
        """

        def __init__(self,train = None,test = None):
            """Store the two data-file paths and load both files right away.

            train -- path of the training ratings file
            test  -- path of the test ratings file
            """
            self.trainfile = train
            self.testfile = test
            self.readData()

        @staticmethod
        def _loadRatings(path):
            """Parse one ratings file into ``{userid: {itemid: record}}``.

            Every non-blank line must hold four whitespace-separated fields:
            userid, itemid, record and a fourth field that is ignored.
            Raises ValueError if a non-blank line has another field count.
            """
            ratings = {}
            # The context manager closes the file even if parsing fails.
            with open(path) as handle:
                for line in handle:
                    fields = line.split()
                    if not fields:
                        continue  # tolerate blank lines (e.g. a trailing newline)
                    userid, itemid, record, _ = fields
                    ratings.setdefault(userid, {})[itemid] = record
            return ratings

        def readData(self,train = None,test = None):
            """(Re)load the training and test sets into traindata / testdata.

            train, test -- optional new file paths; when omitted the paths
                           stored on the instance are reused.
            """
            self.trainfile = train or self.trainfile
            self.testfile = test or self.testfile
            self.traindata = {}
            self.testdata = {}
            self.traindata = self._loadRatings(self.trainfile)
            self.testdata = self._loadRatings(self.testfile)

        def ItemSim(self,train = None):
            """Compute item-item cosine similarity into self.ItemSimlist.

            The similarity of i and j is the number of users who interacted
            with both divided by sqrt(|users of i| * |users of j|).  Only pairs
            that co-occur for at least one user get an entry, so an item that
            never co-occurs with another item is absent from self.ItemSimlist.

            train -- ratings dict to use; defaults to self.traindata.
            """
            train = train or self.traindata
            ItemSimcount = dict()   # ItemSimcount[i][j] = users who have both i and j
            Item_count = dict()     # Item_count[i] = users who have i
            for items in train.values():
                for itemidi in items:
                    Item_count[itemidi] = Item_count.get(itemidi, 0) + 1
                    for itemidj in items:
                        if itemidi == itemidj:
                            continue
                        row = ItemSimcount.setdefault(itemidi, {})
                        row[itemidj] = row.get(itemidj, 0) + 1
            self.ItemSimlist = dict()
            for itemidi, related_item in ItemSimcount.items():
                ni = Item_count[itemidi]
                self.ItemSimlist[itemidi] = {
                    itemidj: wij / math.sqrt(ni * Item_count[itemidj] * 1.0)
                    for itemidj, wij in related_item.items()
                }

        def recommend(self,user,train = None,k = 5,nitem = 10):
            """Return up to nitem recommended items for user as {itemid: score}.

            user  -- user id to recommend for
            train -- ratings dict to use; defaults to self.traindata
            k     -- number of most similar items consulted per rated item
            nitem -- maximum number of items returned

            Each candidate's score is the sum of float(record) * similarity
            over the user's items.  An item with no similarity entry is
            skipped instead of raising KeyError.
            """
            train = train or self.traindata
            recommendlist = dict()
            User_Itemlist = train.get(user, {})
            for i, ri in User_Itemlist.items():
                # The k items most similar to i; none if i never co-occurred.
                neighbours = sorted(self.ItemSimlist.get(i, {}).items(),
                                    key = lambda x: x[1], reverse = True)[0:k]
                for j, wij in neighbours:
                    if j in User_Itemlist:
                        continue  # only recommend items the user has not rated
                    recommendlist[j] = recommendlist.get(j, 0) + float(ri) * wij
            # Highest score first, truncated to nitem entries.
            return dict(sorted(recommendlist.items(), key = lambda x: x[1], reverse = True)[0:nitem])

        def recallAndPrecision(self,train = None,test = None,k = 5,nitem = 10):
            """Return (recall, precision) of the recommendations on the test set.

            recall    = hits / total number of test items of all training users
            precision = hits / (nitem * number of training users); nitem is
                        added per user even if fewer items were recommended.
            A ratio whose denominator is zero is reported as 0.0.
            """
            train = train or self.traindata
            test = test or self.testdata
            hit = 0
            recall = 0
            precision = 0
            for user in train:
                tu = test.get(user, {})  # users missing from the test set count as empty
                rank = self.recommend(user, train = train, k = k, nitem = nitem)
                hit += sum(1 for item in rank if item in tu)
                recall += len(tu)
                precision += nitem
            recall_value = float(hit) / recall if recall else 0.0
            precision_value = float(hit) / precision if precision else 0.0
            return (recall_value, precision_value)

        def coverage(self,train = None,test = None,k = 5,nitem = 10):
            """Return the ratio of distinct recommended items to training items.

            test is accepted for signature compatibility and is not used.
            Returns 0.0 when the training set contains no items.
            """
            train = train or self.traindata
            recommend_items = set()
            all_items = set()
            for user in train:
                all_items.update(train[user])
                recommend_items.update(self.recommend(user, train, k = k, nitem = nitem))
            if not all_items:
                return 0.0
            return len(recommend_items) / (len(all_items) * 1.0)

        def popularity(self,train = None,test = None,k = 5,nitem = 10):
            """Return the mean of log(1 + popularity) over all recommended items.

            An item's popularity is the number of training users who interacted
            with it; recommended items unknown to the training set are skipped.
            test is accepted for signature compatibility and is not used.
            Returns 0.0 when nothing is counted.
            """
            train = train or self.traindata
            item_popularity = dict()
            for items in train.values():
                for item in items:
                    item_popularity[item] = item_popularity.get(item, 0) + 1
            ret = 0
            n = 0
            for user in train:
                rank = self.recommend(user, train, k = k, nitem = nitem)
                for item in rank:
                    if item in item_popularity:
                        ret += math.log(1 + item_popularity[item])
                        n += 1
            if n == 0:
                return 0.0
            return ret / (n * 1.0)
        
            
    def testKNNCF():
        """Evaluate KNN (item-based CF) on 'u1.base' / 'u1.test' and print a table.

        One row is printed per neighbourhood size K, showing precision, recall
        and coverage as percentages followed by the average popularity.
        """
        model = KNN('u1.base', 'u1.test')
        model.ItemSim()
        header = ('K', "precision", 'recall', 'coverage', 'popularity')
        print("%3s%20s%20s%20s%20s" % header)
        for k in (5, 10, 20, 40, 80, 160):
            # recallAndPrecision returns the pair in (recall, precision) order.
            recall, precision = model.recallAndPrecision(k = k)
            row = (
                k,
                precision * 100,
                recall * 100,
                model.coverage(k = k) * 100,
                model.popularity(k = k),
            )
            print("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % row)
            
    # Run the item-based CF evaluation only when this file is executed as a
    # script, not when it is imported.
    if __name__ == "__main__":
        testKNNCF()
  • 相关阅读:
    古老当时兴
    购买代购的产品算违法吗——看空姐代购被判刑有感
    七种方法让你的网站在搜索结果中突围而出(中)
    amf webgame
    游戏开发协议(转)
    array的排序
    用 javascript + actionScript 解决透明的flash在firefox下滚轮失效的问题!(转)
    知乎摘 励志回答
    ie:stagewidth,stageheight的bug
    jsfl bug解决
  • 原文地址:https://www.cnblogs.com/aaronhoo/p/5849715.html
Copyright © 2011-2022 走看看