zoukankan      html  css  js  c++  java
  • 数据挖掘之协同过滤

    # coding:utf-8
    __author__ = 'similarface'
    #datalink=http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip
    '''
    BX-Users["User-ID";"Location";"Age"]
    BX-Books["ISBN";"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L"]
    BX-Book-Ratings["User-ID";"ISBN";"Book-Rating"]
    '''
    
    # codecs is used for encoding conversion when reading the CSV files
    import codecs, os, sys
    from math import sqrt
    
    # Small in-memory sample: listener name -> {artist name: rating}.
    # Used by the demo at the bottom of the script.
    users = {
        "Angelica": {
            "Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5,
            "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5,
            "Vampire Weekend": 2.0,
        },
        "Bill": {
            "Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0,
            "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0,
        },
        "Chan": {
            "Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0,
            "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0,
        },
        "Dan": {
            "Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5,
            "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
            "Vampire Weekend": 2.0,
        },
        "Hailey": {
            "Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0,
            "The Strokes": 4.0, "Vampire Weekend": 1.0,
        },
        "Jordyn": {
            "Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0,
            "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
            "Vampire Weekend": 4.0,
        },
        "Sam": {
            "Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0,
            "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0,
        },
        "Veronica": {
            "Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0,
            "Slightly Stoopid": 2.5, "The Strokes": 3.0,
        },
    }
    
    
    class recommender:
        """User-based collaborative-filtering recommender.

        Ratings live in ``self.data`` as ``{user: {item: rating}}``. They are
        either supplied to the constructor as a dict or loaded from the
        Book-Crossing CSV dump with :meth:`loadBookDB`.
        """

        def __init__(self, data, k=1, metric='pearson', n=5):
            """Create a recommender.

            :param data: ``{user: {item: rating}}`` dict; any other value
                leaves the recommender empty until :meth:`loadBookDB` is called.
            :param k: number of nearest neighbours used by :meth:`recommend`.
            :param metric: name of the similarity measure. Only ``'pearson'``
                is implemented; any other name also uses Pearson.
            :param n: maximum number of recommendations returned.
            """
            self.k = k
            self.n = n
            self.username2id = {}
            self.userid2name = {}
            self.productid2name = {}
            self.metric = metric
            # Pearson is the only implemented similarity, so bind it
            # unconditionally: self.fn must exist for computeNearesNeighbor.
            self.fn = self.pearson
            # isinstance also accepts dict subclasses; a non-dict argument
            # yields an empty data set instead of a missing attribute.
            self.data = data if isinstance(data, dict) else {}

        def loadBookDB(self, path=''):
            """Load the Book-Crossing CSV files found in directory ``path``.

            Reads ``BX-Book-Ratings.csv`` into ``self.data`` (replacing any
            previous ratings), ``BX-Books.csv`` into ``self.productid2name``
            and ``BX-Users.csv`` into ``self.userid2name`` /
            ``self.username2id``. Lines are split on ``;``; lines with too few
            fields or a non-integer rating are skipped. Prints the total
            number of lines read from the three files.
            """
            self.data = {}
            i = 0

            # User ratings: "User-ID";"ISBN";"Book-Rating"
            ratings_path = os.path.join(path, 'BX-Book-Ratings.csv')
            with codecs.open(ratings_path, 'r', 'utf-8', errors='ignore') as f:
                for line in f:
                    i += 1
                    fields = line.split(';')
                    if len(fields) < 3:
                        # blank or truncated line
                        continue
                    user = fields[0].strip('"')
                    book = fields[1].strip('"')
                    try:
                        rating = int(fields[2].strip().strip('"'))
                    except ValueError:
                        # e.g. the header row
                        continue
                    self.data.setdefault(user, {})[book] = rating

            # Book information: "ISBN";"Book-Title";"Book-Author";...
            books_path = os.path.join(path, 'BX-Books.csv')
            with codecs.open(books_path, 'r', 'utf-8', errors='ignore') as f:
                for line in f:
                    i += 1
                    fields = line.split(';')
                    if len(fields) < 3:
                        continue
                    isbn = fields[0].strip('"')
                    title = fields[1].strip('"')
                    author = fields[2].strip('"')
                    # NOTE: the separator has no surrounding spaces; kept
                    # as-is so displayed names do not change.
                    self.productid2name[isbn] = title + 'by' + author

            # User information: "User-ID";"Location";"Age"
            users_path = os.path.join(path, 'BX-Users.csv')
            with codecs.open(users_path, 'r', 'utf-8', errors='ignore') as f:
                for line in f:
                    i += 1
                    fields = line.split(';')
                    if len(fields) < 2:
                        continue
                    userid = fields[0].strip('"')
                    location = fields[1].strip('"')
                    # Age is the third column, so it is present as soon as
                    # there are three fields.
                    if len(fields) >= 3:
                        age = fields[2].strip().strip('"')
                    else:
                        age = 'NULL'
                    if age and age != 'NULL':
                        value = location + ' (age: ' + age + ')'
                    else:
                        value = location
                    self.userid2name[userid] = value
                    self.username2id[location] = userid
            print(i)

        def pearson(self, rating1, rating2):
            """Return the Pearson correlation of two rating dicts.

            Only items present in both dicts are compared. The result lies
            between -1 and 1. Returns 0 when there are no common items or
            when either side has zero variance over the common items.

            Rough interpretation of the magnitude:
                0.8-1.0 very strong correlation
                0.6-0.8 strong correlation
                0.4-0.6 moderate correlation
                0.2-0.4 weak correlation
                0.0-0.2 very weak or no correlation
            """
            sum_xy = sum_x = sum_y = sum_x2 = sum_y2 = 0
            n = 0
            for key in rating1:
                if key in rating2:
                    n += 1
                    x = rating1[key]
                    y = rating2[key]
                    sum_xy += x * y
                    sum_x += x
                    sum_y += y
                    sum_x2 += x ** 2
                    sum_y2 += y ** 2
            if n == 0:
                return 0
            # float() forces true division even for integer ratings under
            # Python 2.
            count = float(n)
            # Rounding can push these slightly below zero; clamp so sqrt
            # cannot raise a math domain error.
            spread_x = max(0.0, sum_x2 - (sum_x ** 2) / count)
            spread_y = max(0.0, sum_y2 - (sum_y ** 2) / count)
            denominator = sqrt(spread_x) * sqrt(spread_y)
            if denominator == 0:
                return 0
            return (sum_xy - (sum_x * sum_y) / count) / denominator

        def computeNearesNeighbor(self, username):
            """Return ``[(other_user, similarity), ...]``, most similar first.

            Every user in ``self.data`` except ``username`` is scored with
            ``self.fn``. Raises ``KeyError`` if ``username`` has no ratings.
            """
            own_ratings = self.data[username]
            distinces = [(other, self.fn(own_ratings, self.data[other]))
                         for other in self.data if other != username]
            distinces.sort(key=lambda pair: pair[1], reverse=True)
            return distinces

        def recommend(self, user):
            """Return up to ``self.n`` ``(item name, score)`` recommendations.

            The ``self.k`` most similar users are taken (fewer if not enough
            users exist). Each neighbour's rating for an item the target user
            has not rated is added to that item's score, weighted by the
            neighbour's share of the summed similarity. Returns an empty list
            when no neighbour is available or the similarities sum to zero.
            Raises ``KeyError`` if ``user`` has no ratings.
            """
            recommendations = {}
            nearest = self.computeNearesNeighbor(user)
            userRating = self.data[user]
            # Slice instead of indexing so k may exceed the neighbour count.
            neighbours = nearest[:max(self.k, 0)]
            totalDistance = sum((score for _, score in neighbours), 0.0)
            if totalDistance == 0:
                # Nothing to weight by; avoids a division by zero.
                return []
            for name, score in neighbours:
                weight = score / totalDistance
                neighborRatings = self.data[name]
                for artist in neighborRatings:
                    # Only items the target user has not rated yet.
                    if artist not in userRating:
                        recommendations[artist] = (
                            recommendations.get(artist, 0)
                            + neighborRatings[artist] * weight)
            ranked = [(self.convertProductID2name(key), value)
                      for key, value in recommendations.items()]
            ranked.sort(key=lambda pair: pair[1], reverse=True)
            return ranked[:self.n]

        def convertProductID2name(self, id):
            """Return the product name for ``id``, or ``id`` itself if unknown."""
            return self.productid2name.get(id, id)

        def userRatings(self, id, n):
            """Print the ``n`` highest ratings given by user ``id``.

            Prints a heading, the total number of ratings of the user and then
            one ``name<TAB>rating`` line per item (rating formatted with
            ``%i``). Users without an entry in ``userid2name`` are shown by
            their id. Raises ``KeyError`` if ``id`` has no ratings.

            :param id: user id (key of ``self.data``)
            :param n: number of ratings to print
            :return: None
            """
            print("Ratings for " + self.userid2name.get(id, id))
            ratings = self.data[id]
            print(len(ratings))
            ranked = [(self.convertProductID2name(key), value)
                      for key, value in ratings.items()]
            ranked.sort(key=lambda pair: pair[1], reverse=True)
            for rating in ranked[:n]:
                print("%s	%i" % (rating[0], rating[1]))
    
    
    if __name__ == '__main__':
        # Demo 1: recommend artists for a listener from the in-memory sample.
        engine = recommender(users)
        sample_picks = engine.recommend('Veronica')
        print(sample_picks)
        # Demo 2: replace the ratings with the Book-Crossing dump and
        # recommend books for one of its users.
        engine.loadBookDB(u'D:/360安全浏览器下载/BX-CSV-Dump')
        book_picks = engine.recommend('276737')
        print(book_picks)
    

     

    #result:
    [('Blues Traveler', 5.0)]
    1700021
    [(u"Devil's Waltz (Alex Delaware Novels (Paperback))byJonathan Kellerman", 9.0), (u'Silent Partner (Alex Delaware Novels (Paperback))byJonathan Kellerman', 8.0), (u'The Outsiders (Now in Speak!)byS. E. Hinton', 8.0), (u'Sein LanguagebyJERRY SEINFELD', 8.0), (u'The Girl Who Loved Tom GordonbyStephen King', 8.0)]
  • 相关阅读:
    数组系列教材 (二)- Java 如何初始化数组
    数组系列教材 (一)- Java 如何创建一个数组
    数组系列教材 (一)- Java 如何创建一个数组
    JAVA 面试题
    JAVA 面试题
    JAVA 面试题
    HelloWorld系列(五)- 在Eclipse中运行第一个 java 程序
    [LeetCode] 142. Linked List Cycle II
    [LeetCode] 141. Linked List Cycle
    [LeetCode] 82. Remove Duplicates from Sorted List II
  • 原文地址:https://www.cnblogs.com/similarface/p/5361783.html
Copyright © 2011-2022 走看看