zoukankan      html  css  js  c++  java
  • 基于用户相似性的协同过滤——Python实现

    代码基本来自项亮的《推荐系统实践》,将书上的伪代码具体实现,并参考了 https://www.douban.com/note/336280497/

    还可以加入对用户相似性的归一化操作,效果会更好。

    数据集为MovieLens的10万条数据.
    链接:MovieLens

    #coding:utf-8
    import random,math
    from operator import itemgetter
    
    class UserBasedCF:
        """User-based collaborative filtering recommender with offline metrics.

        ``train`` and ``test`` are nested dicts of the form
        ``{user: {item: rating}}``.  Ratings are kept exactly as supplied
        (strings when read by ``loadData``) and converted with ``float`` only
        when recommendation scores are accumulated.

        ``simiMatrix`` is ``{u: {v: similarity}}``.  It is built lazily by
        ``recommend`` and only contains users that share at least one item
        with somebody else.
        """

        def __init__(self, trainDataFile=None, testDataFile=None, splitor='\t'):
            """Optionally load the train and test sets from rating files.

            Args:
                trainDataFile: path of the training ratings file, or None.
                testDataFile: path of the test ratings file, or None.
                splitor: field separator used in both files (tab by default).
            """
            # Always define both data sets so that an instance created without
            # files is usable (and the metrics callable) before setData().
            self.train = {}
            self.test = {}
            if trainDataFile is not None:
                self.train = self.loadData(trainDataFile, splitor)
            if testDataFile is not None:
                self.test = self.loadData(testDataFile, splitor)
            self.simiMatrix = {}

        def setData(self, train, test):
            """Replace the train and test sets with in-memory dicts.

            The cached similarity matrix is dropped because it was computed
            from the previous training data.
            """
            self.train = train
            self.test = test
            self.simiMatrix = {}

        def loadData(self, dataFile, splitor='\t'):
            """Read a ratings file into ``{user: {item: rating}}``.

            Each non-blank line must hold at least ``user``, ``item`` and
            ``rating`` separated by ``splitor``; further fields (such as a
            timestamp) are ignored.  Lines that do not contain ``splitor`` are
            split on arbitrary whitespace instead, which keeps files that the
            previous whitespace-only parser accepted loadable.

            Raises:
                ValueError: if a non-blank line has fewer than three fields.
            """
            data = {}
            with open(dataFile) as handle:
                for lineNo, line in enumerate(handle, 1):
                    line = line.strip()
                    if not line:
                        continue
                    fields = line.split(splitor)
                    if len(fields) < 3:
                        fields = line.split()
                    if len(fields) < 3:
                        raise ValueError(
                            'line %d of %s: expected user, item and rating, got %r'
                            % (lineNo, dataFile, line))
                    user, item, record = fields[:3]
                    data.setdefault(user, {})[item] = record
            return data

        def recallAndPrecision(self, peersCount, topN=10):
            """Return ``(recall, precision)`` of the top-N lists on the test set.

            Recall is hits divided by the number of test items of all training
            users.  Precision is hits divided by ``topN`` times the number of
            training users, i.e. every user is charged a full list even when
            fewer items could be recommended.  Either value is 0.0 when its
            denominator is zero.
            """
            hit = 0
            relevant = 0
            recommended = 0
            for user in self.train:
                heldOut = self.test.get(user, {})
                recItems = self.recommend(user, peersCount, topN)
                hit += sum(1 for item in recItems if item in heldOut)
                relevant += len(heldOut)
                recommended += topN
            recall = hit / float(relevant) if relevant else 0.0
            precision = hit / float(recommended) if recommended else 0.0
            return (recall, precision)

        def coverage(self, peersCount, topN=10):
            """Return the share of training items that appear in any top-N list.

            Returns 0.0 when the training set contains no items.
            """
            recommend_items = set()
            all_items = set()
            for user, items in self.train.items():
                all_items.update(items)
                recommend_items.update(self.recommend(user, peersCount, topN))
            if not all_items:
                return 0.0
            return len(recommend_items) / float(len(all_items))

        def popularity(self, peersCount, topN=10):
            """Return the mean ``log(1 + popularity)`` of all recommended items.

            An item's popularity is the number of training users who
            interacted with it.  Returns 0.0 when nothing is recommended.
            """
            item_popularity = {}
            for items in self.train.values():
                for item in items:
                    # Start from zero: the first interaction counts once.
                    item_popularity[item] = item_popularity.get(item, 0) + 1
            ret = 0
            n = 0
            for user in self.train:
                for item in self.recommend(user, peersCount, topN):
                    ret += math.log(1 + item_popularity[item])
                    n += 1
            if n == 0:
                return 0.0
            return ret / float(n)

        def calUserSimilarity(self):
            """Build ``self.simiMatrix`` from the training data.

            For every pair of users that share items, each shared item
            contributes ``1 / log(1 + number of users of that item)``, so
            items used by many users weigh less.  The sum is divided by
            ``sqrt(|items of u| * |items of v|)``.
            """
            # Invert the data: item -> set of users who interacted with it.
            item_users = {}
            for u, ratings in self.train.items():
                for i in ratings:
                    item_users.setdefault(i, set()).add(u)

            # Accumulate the weighted co-occurrence of every user pair.
            coRatedCount = {}
            itemCountOfUser = {}
            for users in item_users.values():
                # Same for every pair drawn from this item, so compute it once.
                weight = 1 / math.log(1 + len(users))
                for u in users:
                    itemCountOfUser[u] = itemCountOfUser.get(u, 0) + 1
                    for v in users:
                        if u == v:
                            continue
                        row = coRatedCount.setdefault(u, {})
                        row[v] = row.get(v, 0) + weight

            userSimiMatrix = {}
            for u, related_users in coRatedCount.items():
                row = userSimiMatrix.setdefault(u, {})
                for v, cuv in related_users.items():
                    row[v] = cuv / math.sqrt(itemCountOfUser[u] * itemCountOfUser[v])
            self.simiMatrix = userSimiMatrix

        def recommend(self, userU, peersCount, topN=10):
            """Return up to ``topN`` recommendations as ``{item: score}``.

            The ``peersCount`` most similar users are consulted; every item
            they interacted with that ``userU`` has not seen yet receives
            ``similarity * float(rating)`` from each of them.  A user that is
            unknown or has no similar users gets an empty dict.
            """
            # Prepare the user similarity matrix on first use.
            if not self.simiMatrix:
                self.calUserSimilarity()
            interacted_items = self.train.get(userU, {})
            # Users without any co-rated item are absent from the matrix.
            peers = sorted(self.simiMatrix.get(userU, {}).items(),
                           key=itemgetter(1), reverse=True)[0:peersCount]
            recItems = {}
            for userV, simiUV in peers:
                for item, ratingV4I in self.train[userV].items():
                    if item in interacted_items:
                        continue
                    # The neighbour's raw rating weights the similarity; the
                    # rating is not rescaled.
                    recItems[item] = recItems.get(item, 0) + simiUV * float(ratingV4I)
            ranked = sorted(recItems.items(), key=itemgetter(1), reverse=True)
            return dict(ranked[0:topN])
    
    def testUserBasedCF(trainDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.base',
                        testDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.test'):
        """Print precision, recall, coverage and popularity for several K.

        A UserBasedCF model is loaded from a pre-split train/test file pair
        and evaluated with 5, 10, 20, 40, 80 and 160 neighbours.

        Args:
            trainDataFile: path of the training ratings file.
            testDataFile: path of the test ratings file.

        NOTE(review): the default paths were reconstructed from literals whose
        directory separators had been lost; they presumably point at the
        author's local data set copy -- pass your own paths to run this.
        """
        cf = UserBasedCF(trainDataFile=trainDataFile, testDataFile=testDataFile)
        print("%3s%15s%15s%15s%15s" % ('K',"precision",'recall','coverage','popularity'))
        for k in [5,10,20,40,80,160]:
            # recallAndPrecision returns (recall, precision), in that order.
            recall, precision = cf.recallAndPrecision(peersCount=k)
            coverage = cf.coverage(peersCount=k)
            popularity = cf.popularity(peersCount=k)
            print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k,precision * 100,recall * 100,coverage * 100,popularity))
    
    def SplitData(wholeData, M, k, seed, splitor='\t'):
        """Randomly split rating lines into a train and a test set.

        Args:
            wholeData: iterable of text lines, each holding at least
                ``user``, ``item`` and ``score`` separated by ``splitor``;
                further fields (such as a timestamp) are ignored.
            M: upper bound of the random bucket number drawn per line.
            k: bucket number whose lines go to the test set.
            seed: seed for the module-level ``random`` generator, so the same
                seed reproduces the same split.
            splitor: field separator (tab by default).

        Returns:
            ``(train, test)``, each a dict ``{user: {item: score}}`` with the
            scores kept as strings.

        Raises:
            ValueError: if a non-blank line has fewer than three fields.

        NOTE: ``random.randint(0, M)`` includes both ends, so there are
        ``M + 1`` buckets and about ``1 / (M + 1)`` of the lines end up in the
        test set.  This is kept unchanged so that existing seeded splits stay
        reproducible.
        """
        test = {}
        train = {}
        random.seed(seed)

        for line in wholeData:
            line = line.strip()
            if not line:
                # Blank lines carry no rating and must not consume a random
                # draw, otherwise they would shift the split of later lines.
                continue
            user, item, score = line.split(splitor)[:3]
            target = test if random.randint(0, M) == k else train
            target.setdefault(user, {})[item] = score
        return train, test
        
    def testUserBasedCF2(dataFile=r'E:\ResearchAndPapers\DataSet\ml-1m\ratings.dat'):
        """Split one ratings file in memory, then print the metrics per K.

        The file is read with ``::`` as field separator and split by
        ``SplitData(wholeData, 8, 5, 10)``; the resulting sets are handed to
        a UserBasedCF model, which is evaluated with 5, 10, 20, 40, 80 and
        160 neighbours.

        Args:
            dataFile: path of the ratings file to split.

        NOTE(review): the default path was reconstructed from a literal that
        had lost its directory separators and was broken across two lines; it
        presumably points at the author's local data set copy -- pass your
        own path to run this.
        """
        # Close the file as soon as the split has consumed it.
        with open(dataFile) as wholeData:
            train, test = SplitData(wholeData, 8, 5, 10, splitor='::')
        cf = UserBasedCF()
        cf.setData(train, test)
        print("%3s%15s%15s%15s%15s" % ('K',"precision",'recall','coverage','popularity'))
        for k in [5,10,20,40,80,160]:
            # recallAndPrecision returns (recall, precision), in that order.
            recall, precision = cf.recallAndPrecision(peersCount=k)
            coverage = cf.coverage(peersCount=k)
            popularity = cf.popularity(peersCount=k)
            print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k,precision * 100,recall * 100,coverage * 100,popularity))
            
    # Script entry point: run the evaluation that loads pre-split files.
    if __name__=="__main__":
        testUserBasedCF()
        # The variant that splits a single ratings file is left disabled.
        #testUserBasedCF2()
        
  • 相关阅读:
    【Demo 0011】多媒体播放器
    【Demo 0010】事件响应链
    【Demo 0009】表视图控制器
    【Demo 0008】标签控制器
    【Demo 0007】导航控制器
    【Demo 0006】iOS常用控件
    【Demo 0005】视图控制器
    【Demo 0004】屏幕、窗体及视图基础知识
    2019.7.16考试反思
    内网 可怜与超市题解 树形dp+合并
  • 原文地址:https://www.cnblogs.com/aaronhoo/p/5851200.html
Copyright © 2011-2022 走看看