zoukankan      html  css  js  c++  java
  • 协同过滤算法简单实现

    以下代码主要来自《推荐系统实践》第二章,修正了书中的一些错误,简单地实现了基于用户的协同过滤算法(UserCF)和基于物品的协同过滤算法(ItemCF),可供参考:

    import math
    import random
    from collections import defaultdict
    from operator import itemgetter
    
    
    def user_similarity(train):
        """
        Build the user-user similarity matrix for user-based CF (UserCF).

        Similarity is the cosine form ``|N(u) & N(v)| / sqrt(|N(u)| * |N(v)|)``
        where ``N(x)`` is the set of items user ``x`` interacted with.

        :param train: training set, a mapping ``user -> items``; only the item
            ids are read, so ``items`` may be an ``{item: rating}`` dict (as in
            the demo data) or any iterable of item ids
        :return: ``dict`` mapping ``u -> defaultdict(float)`` of ``v -> similarity``;
            it holds only pairs of distinct users sharing at least one item, so a
            user without any such neighbour has no entry at all
        """
        # Inverse table: item -> set of users who interacted with it.
        item_users = defaultdict(set)
        for u, items in train.items():
            for i in items:
                item_users[i].add(u)

        # c[u][v] counts the items shared by u and v; n[u] counts u's items.
        c = defaultdict(lambda: defaultdict(int))
        n = defaultdict(int)
        for users in item_users.values():
            for u in users:
                n[u] += 1
                for v in users:
                    if u != v:
                        c[u][v] += 1

        # Normalise the co-occurrence counts into the final similarity matrix.
        w = dict()
        for u, related_users in c.items():
            row = w[u] = defaultdict(float)
            for v, cuv in related_users.items():
                row[v] = cuv / math.sqrt(n[u] * n[v])
        return w
    
    
    def user_similarity2(train):
        """
        Build the user-user similarity matrix for UserCF-IIF.

        Same as plain UserCF, except that every shared item contributes
        ``1 / log(1 + |N(i)|)`` instead of ``1``, where ``|N(i)|`` is the number
        of users who interacted with item ``i``; popular items therefore count
        less towards the similarity of two users.

        :param train: training set, a mapping ``user -> items``; only the item
            ids are read, so ``items`` may be an ``{item: rating}`` dict (as in
            the demo data) or any iterable of item ids
        :return: ``dict`` mapping ``u -> defaultdict(float)`` of ``v -> similarity``;
            it holds only pairs of distinct users sharing at least one item, so a
            user without any such neighbour has no entry at all
        """
        # Inverse table: item -> set of users who interacted with it.
        item_users = defaultdict(set)
        for u, items in train.items():
            for i in items:
                item_users[i].add(u)

        # c[u][v] accumulates the penalised co-occurrence; n[u] counts u's items.
        c = defaultdict(lambda: defaultdict(float))
        n = defaultdict(int)
        for users in item_users.values():
            # Popular-item penalty; it depends only on the item, so compute it
            # once per item. len(users) >= 1, hence log(...) >= log(2) > 0.
            penalty = 1 / math.log(1 + len(users))
            for u in users:
                n[u] += 1
                for v in users:
                    if u != v:
                        c[u][v] += penalty

        # Normalise the penalised counts into the final similarity matrix.
        w = dict()
        for u, related_users in c.items():
            row = w[u] = defaultdict(float)
            for v, cuv in related_users.items():
                row[v] = cuv / math.sqrt(n[u] * n[v])
        return w
    
    
    def item_similarity(train):
        """
        Build the item-item similarity matrix for item-based CF (ItemCF).

        Similarity is ``|N(i) & N(j)| / sqrt(|N(i)| * |N(j)|)`` where ``N(x)`` is
        the set of users who interacted with item ``x``.

        :param train: training set, a mapping ``user -> items``; only the item
            ids are read, so ``items`` may be an ``{item: rating}`` dict (as in
            the demo data) or any re-iterable collection of item ids
        :return: ``dict`` mapping ``i -> defaultdict(float)`` of ``j -> similarity``;
            it holds only pairs of distinct items sharing at least one user, so
            an item that never co-occurs with another one has no entry at all
        """
        # c[i][j] counts the users who have both i and j; n[i] counts i's users.
        c = defaultdict(lambda: defaultdict(int))
        n = defaultdict(int)
        for items in train.values():
            for i in items:
                n[i] += 1
                # Touch c[i] up front so items keep their first-seen order even
                # when their first basket contains nothing else.
                co_counts = c[i]
                for j in items:
                    if i != j:
                        co_counts[j] += 1

        # Normalise the co-occurrence counts into the final similarity matrix.
        w = dict()
        for i, related_items in c.items():
            if not related_items:
                continue  # item never co-occurred with any other item
            row = w[i] = defaultdict(float)
            for j, cij in related_items.items():
                row[j] = cij / math.sqrt(n[i] * n[j])
        return w
    
    
    def item_similarity2(train):
        """
        Build the item-item similarity matrix for ItemCF-IUF.

        Same as plain ItemCF, except that every user contributes
        ``1 / log(1 + |N(u)|)`` instead of ``1`` to each item pair in their
        history, where ``|N(u)|`` is the number of items of user ``u``; very
        active users therefore count less towards item similarity.

        :param train: training set, a mapping ``user -> items``; ``items`` must
            support ``len()`` and repeated iteration (an ``{item: rating}`` dict
            in the demo data)
        :return: ``dict`` mapping ``i -> defaultdict(float)`` of ``j -> similarity``;
            it holds only pairs of distinct items sharing at least one user, so
            an item that never co-occurs with another one has no entry at all
        """
        # c[i][j] accumulates the penalised co-occurrence; n[i] counts i's users.
        c = defaultdict(lambda: defaultdict(float))
        n = defaultdict(int)
        for items in train.values():
            if not items:
                continue
            # Active-user penalty; it depends only on the user, so compute it
            # once per user. len(items) >= 1 here, hence log(...) >= log(2) > 0.
            penalty = 1 / math.log(1 + len(items))
            for i in items:
                n[i] += 1
                # Touch c[i] up front so items keep their first-seen order even
                # when their first basket contains nothing else.
                co_weights = c[i]
                for j in items:
                    if i != j:
                        co_weights[j] += penalty

        # Normalise the penalised counts into the final similarity matrix.
        w = dict()
        for i, related_items in c.items():
            if not related_items:
                continue  # item never co-occurred with any other item
            row = w[i] = defaultdict(float)
            for j, cij in related_items.items():
                row[j] = cij / math.sqrt(n[i] * n[j])
        return w
    
    
    def item_similarity3(train):
        """
        Build the row-normalised item-item similarity matrix for ItemCF-IUF.

        First computes the ItemCF-IUF similarity (each user contributes
        ``1 / log(1 + |N(u)|)`` to every item pair in their history, then the
        sum is divided by ``sqrt(|N(i)| * |N(j)|)``), and finally divides every
        row ``w[i]`` by its own maximum so the best neighbour of each item has
        similarity exactly ``1.0``. The result is generally not symmetric.

        :param train: training set, a mapping ``user -> items``; ``items`` must
            support ``len()`` and repeated iteration (an ``{item: rating}`` dict
            in the demo data)
        :return: ``dict`` mapping ``i -> defaultdict(float)`` of ``j -> similarity``;
            it holds only pairs of distinct items sharing at least one user, so
            an item that never co-occurs with another one has no entry at all
        """
        # c[i][j] accumulates the penalised co-occurrence; n[i] counts i's users.
        c = defaultdict(lambda: defaultdict(float))
        n = defaultdict(int)
        for items in train.values():
            if not items:
                continue
            # Active-user penalty; it depends only on the user, so compute it
            # once per user. len(items) >= 1 here, hence log(...) >= log(2) > 0.
            penalty = 1 / math.log(1 + len(items))
            for i in items:
                n[i] += 1
                # Touch c[i] up front so items keep their first-seen order even
                # when their first basket contains nothing else.
                co_weights = c[i]
                for j in items:
                    if i != j:
                        co_weights[j] += penalty

        # Normalise the penalised counts into the similarity matrix.
        w = dict()
        for i, related_items in c.items():
            if not related_items:
                continue  # item never co-occurred with any other item
            row = w[i] = defaultdict(float)
            for j, cij in related_items.items():
                row[j] = cij / math.sqrt(n[i] * n[j])

        # Row-wise max normalisation. Every stored row is non-empty and all of
        # its values are strictly positive, so max_value is never zero.
        for row in w.values():
            max_value = max(row.values())
            for j in row:
                row[j] /= max_value
        return w
    
    
    def recommend_by_item(train, user_id, w, k):
        """
        Score unseen items for a user with item-based CF.

        For every item ``i`` in the user's history, the ``k`` items most similar
        to ``i`` are taken from ``w``; each of those the user has not interacted
        with yet gets ``rating(i) * similarity(i, j)`` added to its score.

        :param train: training set, a mapping ``user -> {item: rating}``
        :param user_id: user to recommend for; must be a key of ``train``
        :param w: item similarity matrix, a mapping ``i -> {j: similarity}``
        :param k: number of most similar items considered per history item
        :return: ``defaultdict(float)`` mapping candidate item -> accumulated score
        :raises KeyError: if ``user_id`` is not in ``train``
        """
        rank = defaultdict(float)
        ru = train[user_id]
        for i, pi in ru.items():
            # An item that never co-occurred with another one has no row in the
            # similarity matrix; treat it as having no neighbours instead of
            # failing with KeyError.
            neighbours = sorted(w.get(i, {}).items(), key=itemgetter(1), reverse=True)
            for j, wj in neighbours[:k]:
                if j in ru:
                    # Skip items the user has already interacted with.
                    continue
                rank[j] += pi * wj
        return rank
    
    
    def recommend_by_user(user, train, w, k):
        """
        Score unseen items for a user with user-based CF.

        The ``k`` users most similar to ``user`` are taken from ``w``; every item
        of such a neighbour that ``user`` has not interacted with yet gets
        ``similarity(user, v) * rating(v, item)`` added to its score.

        :param user: user to recommend for; must be a key of ``train``
        :param train: training set, a mapping ``user -> {item: rating}``
        :param w: user similarity matrix, a mapping ``u -> {v: similarity}``
        :param k: number of most similar users considered
        :return: ``defaultdict(float)`` mapping candidate item -> accumulated score
        :raises KeyError: if ``user`` (or a neighbour listed in ``w``) is not in
            ``train``
        """
        rank = defaultdict(float)
        interacted_items = train[user]
        # A user sharing no item with anybody has no row in the similarity
        # matrix; treat that as "no neighbours" instead of failing with KeyError.
        neighbours = sorted(w.get(user, {}).items(), key=itemgetter(1), reverse=True)
        for v, wuv in neighbours[:k]:
            for i, rvi in train[v].items():
                if i in interacted_items:
                    # Filter out items the user has interacted with before.
                    continue
                rank[i] += wuv * rvi
        return rank
    
    
    if __name__ == '__main__':
        # Demo data for the user-based recommenders: user -> {item: rating}.
        train = {
            'A': {'a': 1, 'b': 1, 'd': 1},
            'B': {'a': 1, 'c': 1},
            'C': {'b': 1, 'e': 1},
            'D': {'c': 1, 'd': 1, 'e': 1},
        }

        # Recommend for user 'A' from the 3 nearest users, without and with
        # the popular-item penalty.
        plain_user_sim = user_similarity(train)
        rank = recommend_by_user('A', train, plain_user_sim, 3)
        print('UserCF:', dict(rank))

        penalised_user_sim = user_similarity2(train)
        rank2 = recommend_by_user('A', train, penalised_user_sim, 3)
        print('UserCF-IIF:', dict(rank2))

        # Demo data for the item-based recommenders: user -> {item: rating}.
        train2 = {
            'A': {'a': 1, 'b': 1, 'd': 1},
            'B': {'b': 1, 'c': 1, 'e': 1},
            'C': {'c': 1, 'd': 1},
            'D': {'b': 1, 'c': 1, 'd': 1},
            'E': {'a': 1, 'd': 1},
        }

        # Recommend for user 'A' from the 5 nearest items of each history item,
        # using the three item-similarity variants in turn.
        plain_item_sim = item_similarity(train2)
        rank3 = recommend_by_item(train2, 'A', plain_item_sim, 5)
        print('ItemCF:', dict(rank3))

        penalised_item_sim = item_similarity2(train2)
        rank4 = recommend_by_item(train2, 'A', penalised_item_sim, 5)
        print('ItemCF-IUF:', dict(rank4))

        normalised_item_sim = item_similarity3(train2)
        rank5 = recommend_by_item(train2, 'A', normalised_item_sim, 5)
        print('ItemCF-IUF+Normalization:', dict(rank5))
  • 相关阅读:
    mongodb
    python中读取文件的read、readline、readlines方法区别
    uva 129 Krypton Factor
    hdu 4734
    hdu 5182 PM2.5
    hdu 5179 beautiful number
    hdu 5178 pairs
    hdu 5176 The Experience of Love
    hdu 5175 Misaki's Kiss again
    hdu 5174 Ferries Wheel
  • 原文地址:https://www.cnblogs.com/goingforward/p/10191937.html
Copyright © 2011-2022 走看看