zoukankan      html  css  js  c++  java
  • 协同过滤算法简单实现

    以下代码主要来自《推荐系统实践》第二章,并修正了书中的一些错误,简单地实现了基于用户的协同过滤算法(UserCF)和基于物品的协同过滤算法(ItemCF),可供参考:

    import math
    import random
    from collections import defaultdict
    from operator import itemgetter
    
    
    def user_similarity(train):
        """
        User-based collaborative filtering (UserCF): cosine similarity of users.

        For users u and v the similarity is
        ``|N(u) & N(v)| / sqrt(|N(u)| * |N(v)|)``, where ``N(x)`` is the set of
        items user x interacted with.

        :param train: training set, a mapping ``{user: {item: rating}}``; only
            the item keys are used here, the ratings are ignored
        :return: user similarity matrix ``{u: {v: similarity}}``; a pair is
            present only when the two users share at least one item, so a user
            without any such neighbour has no row at all
        """
        # Build the inverted table item -> set of users who touched the item,
        # so that only user pairs that really share an item are visited.
        item_users = defaultdict(set)
        for u, items in train.items():
            for i in items:
                item_users[i].add(u)

        # c[u][v]: number of items shared by u and v; n[u]: number of items of u.
        c = defaultdict(lambda: defaultdict(int))
        n = defaultdict(int)
        for users in item_users.values():
            for u in users:
                n[u] += 1
                for v in users:
                    if u == v:
                        continue
                    c[u][v] += 1

        # Calculate the final similarity matrix w.
        w = {}
        for u, related_users in c.items():
            row = w[u] = defaultdict(float)
            for v, cuv in related_users.items():
                row[v] = cuv / math.sqrt(n[u] * n[v])
        return w
    
    
    def user_similarity2(train):
        """
        UserCF-IIF: user similarity with a penalty on popular items.

        Same as the plain cosine similarity over item sets, except that every
        item shared by two users contributes ``1 / log(1 + |N(i)|)`` instead of
        1, where ``|N(i)|`` is the number of users who interacted with item i.
        Sharing a popular item therefore says less about two users than
        sharing a rare one.

        :param train: training set, a mapping ``{user: {item: rating}}``; only
            the item keys are used here, the ratings are ignored
        :return: user similarity matrix ``{u: {v: similarity}}``; a pair is
            present only when the two users share at least one item
        """
        # Build the inverted table item -> set of users who touched the item.
        item_users = defaultdict(set)
        for u, items in train.items():
            for i in items:
                item_users[i].add(u)

        # c[u][v]: popularity-weighted count of items shared by u and v;
        # n[u]: number of items of u.
        c = defaultdict(lambda: defaultdict(float))
        n = defaultdict(int)
        for users in item_users.values():
            # Popular-item penalty: identical for every pair met through this
            # item, so compute it once. ``users`` is never empty here, hence
            # the logarithm is at least log(2) and the division is safe.
            penalty = 1 / math.log(1 + len(users))
            for u in users:
                n[u] += 1
                for v in users:
                    if u == v:
                        continue
                    c[u][v] += penalty

        # Calculate the final similarity matrix w.
        w = {}
        for u, related_users in c.items():
            row = w[u] = defaultdict(float)
            for v, cuv in related_users.items():
                row[v] = cuv / math.sqrt(n[u] * n[v])
        return w
    
    
    def item_similarity(train):
        """
        Item-based collaborative filtering (ItemCF): cosine similarity of items.

        For items i and j the similarity is
        ``|N(i) & N(j)| / sqrt(|N(i)| * |N(j)|)``, where ``N(x)`` is the set of
        users who interacted with item x.

        :param train: training set, a mapping ``{user: {item: rating}}``; only
            the item keys are used here, the ratings are ignored
        :return: item similarity matrix ``{i: {j: similarity}}``; an item that
            never co-occurs with another item has no row at all
        """
        # c[i][j]: number of users who interacted with both i and j;
        # n[i]: number of users who interacted with i.
        c = {}
        n = defaultdict(int)
        for items in train.values():
            for i in items:
                n[i] += 1
                co_counts = c.setdefault(i, {})
                for j in items:
                    if i == j:
                        continue
                    co_counts[j] = co_counts.get(j, 0) + 1

        # Calculate the final similarity matrix w.
        w = {}
        for i, related_items in c.items():
            for j, cij in related_items.items():
                w.setdefault(i, defaultdict(float))
                w[i][j] = cij / math.sqrt(n[i] * n[j])
        return w
    
    
    def item_similarity2(train):
        """
        ItemCF-IUF: item similarity with a penalty on very active users.

        Same as the plain cosine similarity over user sets, except that every
        user who interacted with both items contributes
        ``1 / log(1 + |N(u)|)`` instead of 1, where ``|N(u)|`` is the number of
        items of that user. A co-occurrence produced by a user who touches
        everything is weaker evidence than one from a selective user.

        :param train: training set, a mapping ``{user: {item: rating}}``; only
            the item keys are used here, the ratings are ignored
        :return: item similarity matrix ``{i: {j: similarity}}``; an item that
            never co-occurs with another item has no row at all
        """
        # c[i][j]: activity-weighted count of users who have both i and j;
        # n[i]: number of users who interacted with i.
        c = {}
        n = defaultdict(int)
        for items in train.values():
            # Active-user penalty: identical for every item pair of this user.
            # With fewer than two items there is no pair to weight, and the
            # guard also avoids dividing by log(1) == 0 for an empty history.
            penalty = 1 / math.log(1 + len(items)) if len(items) > 1 else 0.0
            for i in items:
                n[i] += 1
                co_counts = c.setdefault(i, {})
                for j in items:
                    if i == j:
                        continue
                    co_counts[j] = co_counts.get(j, 0) + penalty

        # Calculate the final similarity matrix w.
        w = {}
        for i, related_items in c.items():
            for j, cij in related_items.items():
                w.setdefault(i, defaultdict(float))
                w[i][j] = cij / math.sqrt(n[i] * n[j])
        return w
    
    
    def item_similarity3(train):
        """
        ItemCF-IUF with a normalised similarity matrix.

        Computes the active-user-penalised item similarity (each co-occurrence
        contributes ``1 / log(1 + |N(u)|)``, divided by
        ``sqrt(|N(i)| * |N(j)|)``) and then divides every row by its largest
        entry, so the most similar neighbour of each item scores exactly 1.

        :param train: training set, a mapping ``{user: {item: rating}}``; only
            the item keys are used here, the ratings are ignored
        :return: normalised item similarity matrix ``{i: {j: similarity}}``; an
            item that never co-occurs with another item has no row at all
        """
        # c[i][j]: activity-weighted count of users who have both i and j;
        # n[i]: number of users who interacted with i.
        c = {}
        n = defaultdict(int)
        for items in train.values():
            # Active-user penalty: identical for every item pair of this user.
            # With fewer than two items there is no pair to weight, and the
            # guard also avoids dividing by log(1) == 0 for an empty history.
            penalty = 1 / math.log(1 + len(items)) if len(items) > 1 else 0.0
            for i in items:
                n[i] += 1
                co_counts = c.setdefault(i, {})
                for j in items:
                    if i == j:
                        continue
                    co_counts[j] = co_counts.get(j, 0) + penalty

        # Calculate the final similarity matrix w, normalising each row by its
        # maximum as soon as the row is complete.
        w = {}
        for i, related_items in c.items():
            if not related_items:
                # Item never co-occurred with anything: it gets no row.
                continue
            row = w[i] = defaultdict(float)
            for j, cij in related_items.items():
                row[j] = cij / math.sqrt(n[i] * n[j])
            max_value = max(row.values())
            for j in row:
                row[j] /= max_value
        return w
    
    
    def recommend_by_item(train, user_id, w, k):
        """
        Score candidate items for a user with item-based collaborative filtering.

        For every item in the user's history, its ``k`` most similar items are
        taken and each one the user has not interacted with yet is credited
        with ``rating * similarity``.

        :param train: training set, a mapping ``{user: {item: rating}}``
        :param user_id: user to recommend for; must be a key of ``train``
        :param w: item similarity matrix ``{i: {j: similarity}}``
        :param k: number of most similar items considered per history item
        :return: ``defaultdict(float)`` mapping candidate item -> score
        :raises KeyError: if ``user_id`` is not in ``train``
        """
        rank = defaultdict(float)
        ru = train[user_id]
        for i, pi in ru.items():
            # An item that never co-occurred with another one has no row in
            # the similarity matrix; it simply contributes no candidates.
            neighbours = w.get(i, {})
            top_k = sorted(neighbours.items(), key=itemgetter(1), reverse=True)[0:k]
            for j, wj in top_k:
                if j in ru:
                    # Never recommend something the user already has.
                    continue
                rank[j] += pi * wj
        return rank
    
    
    def recommend_by_user(user, train, w, k):
        """
        Score candidate items for a user with user-based collaborative filtering.

        The ``k`` users most similar to ``user`` are taken and every item they
        interacted with, which ``user`` has not, is credited with
        ``similarity * rating``.

        :param user: user to recommend for; must be a key of ``train``
        :param train: training set, a mapping ``{user: {item: rating}}``
        :param w: user similarity matrix ``{u: {v: similarity}}``
        :param k: number of most similar users considered
        :return: ``defaultdict(float)`` mapping candidate item -> score
        :raises KeyError: if ``user`` is not in ``train``
        """
        rank = defaultdict(float)
        interacted_items = train[user]
        # A user who shares no item with anybody has no row in the similarity
        # matrix; the result is then simply empty.
        neighbours = w.get(user, {})
        top_k = sorted(neighbours.items(), key=itemgetter(1), reverse=True)[0:k]
        for v, wuv in top_k:
            for i, rvi in train[v].items():
                if i in interacted_items:
                    # Filter out items the user interacted with before.
                    continue
                rank[i] += wuv * rvi
        return rank
    
    
    if __name__ == '__main__':
        # Demo data for the user-based variants: {user: {item: rating}}.
        train = {
            'A': {'a': 1, 'b': 1, 'd': 1},
            'B': {'a': 1, 'c': 1},
            'C': {'b': 1, 'e': 1},
            'D': {'c': 1, 'd': 1, 'e': 1},
        }

        # Recommend for user 'A' from the 3 nearest users.
        user_cf_w = user_similarity(train)
        rank = recommend_by_user('A', train, user_cf_w, 3)
        print('UserCF:', dict(rank))

        user_iif_w = user_similarity2(train)
        rank2 = recommend_by_user('A', train, user_iif_w, 3)
        print('UserCF-IIF:', dict(rank2))

        # Demo data for the item-based variants.
        train2 = {
            'A': {'a': 1, 'b': 1, 'd': 1},
            'B': {'b': 1, 'c': 1, 'e': 1},
            'C': {'c': 1, 'd': 1},
            'D': {'b': 1, 'c': 1, 'd': 1},
            'E': {'a': 1, 'd': 1},
        }

        # Recommend for user 'A' from the 5 nearest items of each history item.
        item_cf_w = item_similarity(train2)
        rank3 = recommend_by_item(train2, 'A', item_cf_w, 5)
        print('ItemCF:', dict(rank3))

        item_iuf_w = item_similarity2(train2)
        rank4 = recommend_by_item(train2, 'A', item_iuf_w, 5)
        print('ItemCF-IUF:', dict(rank4))

        item_norm_w = item_similarity3(train2)
        rank5 = recommend_by_item(train2, 'A', item_norm_w, 5)
        print('ItemCF-IUF+Normalization:', dict(rank5))
  • 相关阅读:
    【BZOJ】1620: [Usaco2008 Nov]Time Management 时间管理(贪心)
    【BZOJ】1651: [Usaco2006 Feb]Stall Reservations 专用牛棚(线段树/前缀和 + 差分)
    【BZOJ】1628 && 1683: [Usaco2007 Demo]City skyline 城市地平线(单调栈)
    【BZOJ】1624: [Usaco2008 Open] Clear And Present Danger 寻宝之路(floyd)
    【BZOJ】1622: [Usaco2008 Open]Word Power 名字的能量(dp/-模拟)
    【BZOJ】1634: [Usaco2007 Jan]Protecting the Flowers 护花(贪心)
    【BZOJ】1690: [Usaco2007 Dec]奶牛的旅行(分数规划+spfa)
    【BZOJ】1660: [Usaco2006 Nov]Bad Hair Day 乱发节(单调栈)
    【BZOJ】1642: [Usaco2007 Nov]Milking Time 挤奶时间(dp)
    【BZOJ】1629: [Usaco2007 Demo]Cow Acrobats(贪心+排序)
  • 原文地址:https://www.cnblogs.com/goingforward/p/10191937.html
Copyright © 2011-2022 走看看