zoukankan      html  css  js  c++  java
  • CF算法

    generate_train_data.py

    import pandas as pd
    import user_cf
    import operator
    import item_cf
    
    # Raw string: in an ordinary literal "\u" begins a unicode escape, so
    # '...\u.data' is rejected as a SyntaxError by Python 3.
    data_path = r'G:\Bigdata_object\u.data'

    # The file is read as tab-separated text with no header row; column names
    # are supplied here.
    udata = pd.read_csv(data_path,
                        sep='\t',
                        header=None,
                        names=['user_id', 'item_id', 'rating', 'timestamp'])

    # Build the nested mapping passed to both CF modules:
    # {user_id: {item_id: rating}}.  IDs are converted to strings, which is
    # why the lookups below use the key '1'.
    train = dict()
    for _, row in udata.iterrows():
        user_id = str(row['user_id'])
        item_id = str(row['item_id'])
        train.setdefault(user_id, dict())[item_id] = row['rating']

    # ################### user_cf test ###################
    W = user_cf.user_similarity(train)
    rec_item_list = user_cf.recommend('1', train, W, 10)
    # Show the 20 highest-scoring candidates for user '1'.
    print(sorted(rec_item_list.items(), key=operator.itemgetter(1), reverse=True)[:20])


    # ################### item_cf test ###################
    W2 = item_cf.item_similarity(train)
    item_list = item_cf.recommend(train, '1', W2, 10)
    # Show the 20 highest-scoring candidates for user '1'.
    print(sorted(item_list.items(), key=operator.itemgetter(1), reverse=True)[:20])

    item_cf.py

    import math
    import operator
    
    
    def item_similarity(train):
        """Build an item-to-item similarity table from co-occurrence counts.

        ``train`` is iterated as ``{user: {item: rating}}``.  Only the item
        keys are used; the rating values are ignored.  For every ordered pair
        of distinct items ``(i, j)`` that appear together for at least one
        user, the similarity is::

            shared_users(i, j) / sqrt(users(i) * users(j))

        Returns:
            A nested dict ``{item: {other_item: similarity}}``.  An item that
            never appears together with another item maps to an empty dict.
        """
        shared_users = {}  # item -> {other item -> users having both} (numerator)
        user_count = {}    # item -> number of users having it (denominator)
        for basket in train.values():
            for left in basket:
                user_count[left] = user_count.get(left, 0) + 1
                row = shared_users.setdefault(left, {})
                for right in basket:
                    if right != left:
                        row[right] = row.get(right, 0) + 1

        # Divide each co-occurrence count by the geometric mean of the two
        # items' user counts.
        similarity = {}
        for left, row in shared_users.items():
            scored = {}
            for right, together in row.items():
                scored[right] = together / math.sqrt(
                    user_count[left] * user_count[right])
            similarity[left] = scored
        return similarity
    
    
    def recommend(train, user, w, k):
        """Score candidate items for ``user`` from item-item similarities.

        For each item the user has rated, the ``k`` entries of ``w[item]`` with
        the highest similarity are taken.  Entries the user has already rated
        are then skipped (they still occupy a top-``k`` slot), and each
        remaining candidate accumulates ``rating * similarity``.

        Args:
            train: ``{user: {item: rating}}``.
            user: key into ``train``.
            w: ``{item: {other_item: similarity}}``, e.g. the result of
                ``item_similarity``.
            k: number of most-similar neighbours considered per rated item.

        Returns:
            ``{candidate_item: score}``; empty when nothing qualifies.

        Raises:
            KeyError: if ``user`` is not in ``train`` or a rated item is not
                in ``w``.
        """
        rated = train[user]
        rank = dict()
        for i, pi in rated.items():
            neighbours = sorted(w[i].items(),
                                key=operator.itemgetter(1),
                                reverse=True)[:k]
            for j, wj in neighbours:
                if j in rated:
                    continue
                # Default to 0 rather than testing against a -1 sentinel: a
                # running score of exactly -1 (possible with negative ratings)
                # must keep accumulating, not be treated as "missing".
                rank[j] = rank.get(j, 0) + pi * wj
        return rank

    user_cf.py

    import operator
    import math
    
    
    # train format: {user: {item: rating}}
    
    
    def user_similarity(train):
        """Build a user-to-user similarity table from shared items.

        ``train`` is iterated as ``{user: {item: rating}}``.  Only the item
        keys are used; the rating values are ignored.  For every ordered pair
        of distinct users ``(u, v)`` sharing at least one item, the similarity
        is::

            shared_items(u, v) / sqrt(items(u) * items(v))

        Returns:
            A nested dict ``{user: {other_user: similarity}}``.  A user with
            items but no overlapping neighbour maps to an empty dict; a user
            with no items at all does not appear in the result.
        """
        # Inverted index: item -> set of users holding that item.
        holders = {}
        for member, rated in train.items():
            for item in rated:
                holders.setdefault(item, set()).add(member)

        shared_items = {}  # user -> {other user -> items in common} (numerator)
        item_count = {}    # user -> number of items held (denominator)
        for group in holders.values():
            for u in group:
                item_count[u] = item_count.get(u, 0) + 1
                row = shared_items.setdefault(u, {})
                for v in group:
                    if v != u:
                        row[v] = row.get(v, 0) + 1

        # Normalise every overlap count into the final similarity matrix.
        similarity = {}
        for u, row in shared_items.items():
            similarity[u] = {
                v: common / math.sqrt(item_count[u] * item_count[v] * 1.0)
                for v, common in row.items()
            }
        return similarity
    
    
    # Items gathered from the most similar users
    def recommend(user, train, w, k):
        """Score candidate items for ``user`` from the most similar users.

        The ``k`` entries of ``w[user]`` with the highest similarity are taken
        as neighbours.  Every item a neighbour has rated that ``user`` has not
        accumulates ``similarity * neighbour_rating``.

        Args:
            user: key into ``train``.
            train: ``{user: {item: rating}}``.
            w: ``{user: {other_user: similarity}}``, e.g. the result of
                ``user_similarity``.
            k: number of most-similar users to draw items from.

        Returns:
            ``{candidate_item: score}``.  Empty when ``user`` has no entry in
            ``w`` (``user_similarity`` omits users without any items) or when
            nothing qualifies.

        Raises:
            KeyError: if ``user`` or a neighbour listed in ``w`` is not in
                ``train``.
        """
        rank = dict()
        interacted_items = train[user]
        # A user with no rated items is absent from the similarity matrix;
        # treat that as "no neighbours" instead of raising KeyError.
        neighbours = sorted(w.get(user, {}).items(),
                            key=operator.itemgetter(1),
                            reverse=True)[:k]
        for v, wuv in neighbours:
            for i, rvi in train[v].items():
                if i in interacted_items:  # skip items the user already rated
                    continue
                # Default to 0 rather than testing against a -1 sentinel: a
                # running score of exactly -1 (possible with negative ratings)
                # must keep accumulating, not be treated as "missing".
                rank[i] = rank.get(i, 0) + wuv * rvi
        return rank
  • 相关阅读:
    Java虚拟机(第二版) 学习笔记之Class类文件的结构
    JVM之深入浅出之垃圾收集算法
    Java虚拟机(第二版) 学习笔记之OutOfMemoryError
    Java虚拟机(第二版) 学习笔记
    平滑加权轮询负载均衡(轮询)算法
    java AQS(AbstractQueuedSynchronizer)同步器详解
    mybatis Interceptor拦截器代码详解
    aspectj编程简介
    Java并发编程阅读笔记-Java监视器模式示例
    我们该怎么结合日志做优化
  • 原文地址:https://www.cnblogs.com/hackerer/p/13422115.html
Copyright © 2011-2022 走看看