zoukankan      html  css  js  c++  java
  • CF算法

    generate_train_data.py

    import pandas as pd
    import user_cf
    import operator
    import item_cf
    
    # Raw string so the backslashes in the Windows path are taken literally;
    # in a normal string literal "\u" starts a Unicode escape and Python 3
    # rejects the file with a SyntaxError.
    data_path = r'G:\Bigdata_object\u.data'

    # The file is read as tab-separated text without a header row, so the
    # column names are supplied explicitly.
    udata = pd.read_csv(data_path,
                        sep='\t',
                        header=None,
                        names=['user_id', 'item_id', 'rating', 'timestamp'])

    # Build the nested rating table {user_id: {item_id: rating}} with string
    # ids. itertuples avoids constructing a Series object for every row.
    train = dict()
    for row in udata.itertuples(index=False):
        train.setdefault(str(row.user_id), dict())[str(row.item_id)] = row.rating

    # ------------------------- user_cf test -------------------------
    # User-user similarity matrix, then scores for user '1' using the
    # 10 most similar users; print the 20 best-scored items.
    W = user_cf.user_similarity(train)
    rec_item_list = user_cf.recommend('1', train, W, 10)
    print(sorted(rec_item_list.items(), key=operator.itemgetter(1), reverse=True)[:20])

    # ------------------------- item_cf test -------------------------
    # Item-item similarity matrix, then scores for user '1' using the
    # 10 most similar items per rated item; print the 20 best-scored items.
    W2 = item_cf.item_similarity(train)
    item_list = item_cf.recommend(train, '1', W2, 10)
    print(sorted(item_list.items(), key=operator.itemgetter(1), reverse=True)[:20])

    item_cf.py

    import math
    import operator
    
    
    def item_similarity(train):
        """Compute item-item cosine-style similarity from co-occurrence counts.

        ``train`` maps each user to a mapping whose keys are the items that
        user interacted with (the values are not read here).

        Returns a dict ``{item_i: {item_j: similarity}}`` where the similarity
        is ``co_count(i, j) / sqrt(users(i) * users(j))``. Every item seen in
        ``train`` gets an entry, which is empty when the item never co-occurs
        with another item.
        """
        co_counts = {}   # item -> {other item -> number of users having both}
        popularity = {}  # item -> number of users having the item

        for basket in train.values():
            for a in basket:
                popularity[a] = popularity.get(a, 0) + 1
                row = co_counts.setdefault(a, {})
                for b in basket:
                    if b != a:
                        row[b] = row.get(b, 0) + 1

        # Normalise each co-occurrence count by the geometric mean of the
        # two items' user counts.
        return {
            a: {
                b: 0 + shared / math.sqrt(popularity[a] * popularity[b])
                for b, shared in row.items()
            }
            for a, row in co_counts.items()
        }
    
    
    def recommend(train, user, w, k):
        """Score items for ``user`` from item-item similarities.

        Parameters:
            train: mapping ``{user: {item: rating}}``.
            user: key into ``train`` identifying the target user.
            w: item similarity mapping ``{item: {other_item: similarity}}``.
            k: number of most similar neighbours considered per rated item.

        Returns:
            dict ``{item: score}`` covering only items the user has not
            rated, where the score is the sum of ``rating * similarity`` over
            the user's rated items that list the candidate among their top-k
            neighbours.

        Raises:
            KeyError: if ``user`` is not in ``train`` or a rated item has no
                entry in ``w``.
        """
        ru = train[user]
        rank = dict()
        for i, pi in ru.items():
            neighbours = sorted(w[i].items(),
                                key=operator.itemgetter(1),
                                reverse=True)[0:k]
            for j, wj in neighbours:
                if j in ru:
                    # Never recommend something the user already rated.
                    continue
                # Accumulate with a real default instead of a -1 sentinel:
                # a running score that legitimately equals -1 (possible with
                # negative or mean-centred ratings) must not be reset to 0.
                rank[j] = rank.get(j, 0) + pi * wj
        return rank

    user_cf.py

    import operator
    import math
    
    
    # train format: {user: {item: rating}}
    
    
    def user_similarity(train):
        """Compute user-user cosine-style similarity from shared items.

        ``train`` has the form ``{user: {item: rating}}``; only the item keys
        are read here.

        Returns a dict ``{user_u: {user_v: similarity}}`` where the similarity
        is ``shared_items(u, v) / sqrt(items(u) * items(v))``. Users with no
        items do not appear; a user who shares nothing with anyone maps to an
        empty dict.
        """
        # Inverted index: item -> set of users who have that item.
        audience_of = {}
        for user, rated in train.items():
            for item in rated:
                audience_of.setdefault(item, set()).add(user)

        overlap = {}     # user -> {other user -> number of items in common}
        item_count = {}  # user -> number of items the user has
        for audience in audience_of.values():
            for a in audience:
                item_count[a] = item_count.get(a, 0) + 1
                row = overlap.setdefault(a, {})
                for b in audience:
                    if b != a:
                        # Plain count per shared item; the original source
                        # notes 1 / log(1 + len(audience)) as an alternative
                        # weighting that is not enabled.
                        row[b] = row.get(b, 0) + 1

        # Normalise the overlap by the geometric mean of both users' item counts.
        return {
            a: {
                b: shared / math.sqrt(item_count[a] * item_count[b] * 1.0)
                for b, shared in row.items()
            }
            for a, row in overlap.items()
        }
    
    
    # Recommend from the items rated by the most similar users
    def recommend(user, train, w, k):
        """Score items for ``user`` from the ratings of similar users.

        Parameters:
            user: key into ``train`` and ``w`` identifying the target user.
            train: mapping ``{user: {item: rating}}``.
            w: user similarity mapping ``{user: {other_user: similarity}}``.
            k: number of most similar users whose ratings are aggregated.

        Returns:
            dict ``{item: score}`` covering only items the target user has
            not rated, where the score is the sum of ``similarity * rating``
            over the top-k most similar users who rated the item.

        Raises:
            KeyError: if ``user`` is missing from ``train`` or ``w``, or a
                neighbour listed in ``w`` is missing from ``train``.
        """
        interacted_items = train[user].keys()
        rank = dict()
        neighbours = sorted(w[user].items(),
                            key=operator.itemgetter(1),
                            reverse=True)[0:k]
        for v, wuv in neighbours:
            for i, rvi in train[v].items():
                if i in interacted_items:
                    # Skip items the target user has already rated.
                    continue
                # Accumulate with a real default instead of a -1 sentinel:
                # a running score that legitimately equals -1 (possible with
                # negative or mean-centred ratings) must not be reset to 0.
                rank[i] = rank.get(i, 0) + wuv * rvi
        return rank
  • 相关阅读:
    linux文件IO操作篇 (二) 缓冲文件
    信号量和互斥锁的区别
    linux 无锁化编程
    C语言中 time相关的函数 头文件
    printf 打印字体和背景带颜色的输出的方法
    在win10环境下安装Cygwin,可以GCC编译
    学习《大话存储》
    linux内核态和用户态的信号量
    学习Makefile
    git 环境搭建
  • 原文地址:https://www.cnblogs.com/hackerer/p/13422115.html
Copyright © 2011-2022 走看看