zoukankan      html  css  js  c++  java
  • 使用tensorflow构造隐语义模型的推荐系统

    先创建一个readers.py(文件名必须与后面的 import readers 一致),后面的程序将用到其中的函数。

    from __future__ import absolute_import, division, print_function
    import numpy as np
    import pandas as pd
    
    
    def read_file(filname, sep="\t"):
        """Load a delimited ratings file into a typed DataFrame.

        The file is parsed without a header row and its columns are named
        ``user``, ``item``, ``rate`` and ``st``.  ``user`` and ``item`` are
        decremented by one and stored as int32, ``rate`` is stored as
        float32, and ``st`` is returned as parsed.

        Args:
            filname: Path (or file-like object) handed to ``pandas.read_csv``.
            sep: Field separator.  The python parser engine is used, which
                also accepts multi-character separators such as ``"::"``.

        Returns:
            The resulting ``pandas.DataFrame``.
        """
        frame = pd.read_csv(
            filname,
            sep=sep,
            header=None,
            names=["user", "item", "rate", "st"],
            engine='python',
        )
        # Shift both id columns down by one so ids that start at 1 in the
        # file start at 0 in the frame.
        for id_col in ("user", "item"):
            frame[id_col] = (frame[id_col] - 1).astype(np.int32)
        frame["rate"] = frame["rate"].astype(np.float32)
        return frame
    
    
    class ShuffleIterator(object):
        """
        Endless iterator that yields randomly sampled batches.

        ``inputs`` is an indexable collection of equally long columns.  The
        columns are stacked into one ``(len, num_cols)`` array, so NumPy
        promotes them to a single common dtype.  Every call to ``next``
        draws ``batch_size`` row indices with ``np.random.randint`` (rows
        may repeat) and never raises ``StopIteration``.
        """
        def __init__(self, inputs, batch_size=10):
            self.batch_size = batch_size
            self.num_cols = len(inputs)
            self.len = len(inputs[0])
            columns = [np.array(inputs[c]) for c in range(self.num_cols)]
            # One row per sample, one column per input sequence.
            self.inputs = np.vstack(columns).T

        def __len__(self):
            # Number of samples (rows), not number of batches.
            return self.len

        def __iter__(self):
            return self

        def __next__(self):
            # Python 3 iterator protocol; delegates to the Python 2 style name.
            return self.next()

        def next(self):
            rows = np.random.randint(0, self.len, (self.batch_size,))
            batch = self.inputs[rows, :]
            # Split the sampled rows back into one 1-D array per column.
            return list(batch.T)
    
    
    class OneEpochIterator(ShuffleIterator):
        """
        Sequentially generate one-epoch batches, typically for test data.

        The row indices ``0 .. len-1`` are split, in order, into
        ``ceil(len / batch_size)`` nearly equal groups, so every batch holds
        at most ``batch_size`` rows.  A non-positive ``batch_size`` yields
        the whole data set as a single batch.  After the last group,
        ``next`` raises ``StopIteration`` and rewinds, so the same object
        can be iterated again for the next evaluation pass.
        """
        def __init__(self, inputs, batch_size=10):
            super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
            if batch_size > 0:
                # Integer ceiling division: avoids handing a float section
                # count to np.array_split.
                num_groups = int(-(-self.len // batch_size))
                if num_groups > 0:
                    self.idx_group = np.array_split(np.arange(self.len), num_groups)
                else:
                    # Empty data set: np.array_split rejects 0 sections, so
                    # expose an epoch with no batches instead of crashing.
                    self.idx_group = []
            else:
                self.idx_group = [np.arange(self.len)]
            self.group_id = 0

        def next(self):
            if self.group_id >= len(self.idx_group):
                # Rewind before signalling the end so the iterator is reusable.
                self.group_id = 0
                raise StopIteration
            out = self.inputs[self.idx_group[self.group_id], :]
            self.group_id += 1
            # One 1-D array per input column, in the original column order.
            return [out[:, i] for i in range(self.num_cols)]

    数据的内容主要是关于电影与用户。

    # 导入数据io操作
    from collections import deque
    from six import next
    
    # 调用readers.py
    import readers
    
    # Main imports for training
    import tensorflow as tf
    import numpy as np
    
    # 评估每个轮次的训练时间
    import time
    # Fixed NumPy seed so the shuffle in get_data() and the random batch
    # sampling are repeatable. No TensorFlow seed is set in the visible code.
    np.random.seed(42)
    
    # Sizes of the user and item embedding tables (about 3,900 movies and
    # 6,040 users). NOTE(review): i_num is 3952 rather than ~3900, presumably
    # because it must cover the largest movie id -- confirm against the data.
    u_num = 6040 
    i_num = 3952 
    
    # Number of samples drawn per training step.
    batch_size = 1000 
    
    # Width of each latent-factor embedding (passed to model() as `dim`).
    dims = 5    
    
    # Maximum number of training epochs.
    max_epochs = 50   
    
    # Device on which the inference and loss ops are placed.
    place_device = "/cpu:0"

    一、加载数据、划分训练集和测试集

    def get_data(path="C:/Users/Administrator/.surprise_data/ml-1m/ratings.dat",
                 sep="::", train_fraction=0.9):
        """Load the ratings file, shuffle it and split it into train/test frames.

        Each record holds user id, item id, rating and timestamp, e.g.
        ``3::1196::4::978297539``.

        Args:
            path: Location of the ratings file.  Defaults to the path that
                was previously hard-coded, so ``get_data()`` is unchanged.
            sep: Field separator of the ratings file.
            train_fraction: Share of the shuffled rows that goes to the
                training frame; the remainder becomes the test frame.

        Returns:
            ``(df_train, df_test)``.  Both frames are indexed from 0.

        Raises:
            ValueError: If ``train_fraction`` is outside ``[0, 1]``.
        """
        if not 0.0 <= train_fraction <= 1.0:
            raise ValueError("train_fraction must be within [0, 1], got %r" % (train_fraction,))

        df = readers.read_file(path, sep=sep)

        # Row count, needed for both the shuffle and the split point.
        rows = len(df)

        # Shuffle by selecting rows in a random positional order, then
        # renumber the index from 0.
        df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)

        # The first `train_fraction` of the shuffled rows is used for training.
        split_index = int(rows * train_fraction)

        df_train = df.iloc[:split_index]
        df_test = df.iloc[split_index:].reset_index(drop=True)

        return df_train, df_test
    
    def clip(x):
        """Clamp ``x`` element-wise to the closed interval [1.0, 5.0]."""
        lowest, highest = 1.0, 5.0
        return np.clip(x, lowest, highest)

    二、定义模型,返回预测结果和正则化项

    def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
        """Build the prediction and regularization ops of a biased latent-factor model.

        The prediction for each (user, item) pair is the dot product of the
        two embeddings plus a global bias, a per-user bias and a per-item bias.

        Args:
            user_batch: Tensor of user indices used for the embedding lookups.
            item_batch: Tensor of item indices used for the embedding lookups.
            user_num: Number of rows in the user bias and embedding tables.
            item_num: Number of rows in the item bias and embedding tables.
            dim: Width of each user/item embedding vector.
            device: Device for the inference and regularizer ops only; the
                variables and lookups are always placed on "/cpu:0".

        Returns:
            A tuple ``(infer, regularizer)``: the predicted ratings for the
            batch and the L2 penalty on the looked-up embeddings.
        """
        with tf.device("/cpu:0"):
            # Variable scope; AUTO_REUSE lets repeated calls share the variables.
            with tf.variable_scope('lsi',reuse=tf.AUTO_REUSE):
                # Global bias (a scalar).
                # get_variable prefixes the name with the current variable
                # scope and performs the reuse check.
                bias_global = tf.get_variable("bias_global",shape=[])
                
                # Per-user bias table.
                w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
                # Per-item (movie) bias table.
                w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
                
                # Biases of the users and items in the current batch.
                bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
                bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
                
                # User and item embedding tables, one `dim`-wide row per id.
                w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                         initializer=tf.truncated_normal_initializer(stddev=0.02))
                w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                         initializer=tf.truncated_normal_initializer(stddev=0.02))
                
                # Embedding vectors of the users and items in the current batch.
                embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
                embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
        
        with tf.device(device):
            # Dot product per pair: element-wise product summed over axis 1.
            infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
            infer = tf.add(infer, bias_global)
            infer = tf.add(infer, bias_user)
            infer = tf.add(infer, bias_item, name="svd_inference")
            
            # L2 regularization term.
            # tf.nn.l2_loss(t) returns sum(t ** 2) / 2, i.e. half the squared
            # L2 norm. Only the looked-up embeddings are penalized here; the
            # bias terms are not part of the regularizer.
            regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), 
                                 name="svd_regularizer")
    
        # Return the predictions and the regularization term.
        return infer, regularizer

    三、定义损失函数

    def loss(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
        """Build the regularized squared-error cost and its gradient-descent step.

        Args:
            infer: Predicted ratings for the batch.
            regularizer: Regularization term to be weighted by ``reg``.
            rate_batch: Observed ratings for the batch.
            learning_rate: Step size given to ``GradientDescentOptimizer``.
            reg: Weight that multiplies ``regularizer`` in the total cost.
            device: Device on which the cost and training ops are placed.

        Returns:
            A tuple ``(cost, train_op)``.
        """
        with tf.device(device):
            # Data loss: l2_loss of (prediction - observed rating), i.e. half
            # the sum of squared errors over the batch (a sum, not a mean).
            # infer is the prediction, rate_batch the ground truth.
            cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
            
            # Regularization weight as a scalar float32 constant.
            penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
            
            # Total cost = data loss + regularizer * penalty.
            cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
            
            # One plain gradient-descent update that minimizes the total cost.
            train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
        return cost, train_op

    四、读取数据以构建tensorflow模型

    # Read the ratings file and split it into training and test frames.
    df_train, df_test = get_data()
    
    # NOTE: despite its name (and the printed label), this is the number of
    # training batches per epoch: train rows // batch_size.
    samples_per_batch = len(df_train) // batch_size
    print("Number of train samples %d, test samples %d, samples per batch %d" % 
          (len(df_train), len(df_test), samples_per_batch))
    Number of train samples 900188, test samples 100021, samples per batch 900
    # Show the first 5 user ids of the training and test frames.
    print(df_train["user"].head()) 
    print(df_test["user"].head())
    0    1834
    1    5836
    2    1266
    3    2468
    4     117
    Name: user, dtype: int32
    0    5062
    1     251
    2    5831
    3    2243
    4    4903
    Name: user, dtype: int32
    # Show the first 5 item ids of the training and test frames.
    print(df_train["item"].head())
    print(df_test["item"].head())
    0    1213
    1     995
    2     355
    3    2040
    4    2670
    Name: item, dtype: int32
    0    2917
    1     291
    2    2027
    3    2310
    4    1930
    Name: item, dtype: int32
    # Show the first 5 ratings of the training and test frames.
    print(df_train["rate"].head())
    print(df_test["rate"].head())
    0    5.0
    1    4.0
    2    2.0
    3    5.0
    4    4.0
    Name: rate, dtype: float32
    0    5.0
    1    4.0
    2    4.0
    3    3.0
    4    5.0
    Name: rate, dtype: float32

    五、训练

    # Shuffle iterator: yields random training batches of `batch_size` rows.
    iter_train = readers.ShuffleIterator([df_train["user"],
                                         df_train["item"],
                                         df_train["rate"]],
                                         batch_size=batch_size)
    
    # One-epoch iterator for evaluation. batch_size=-1 makes it yield the
    # whole test set as a single batch.
    iter_test = readers.OneEpochIterator([df_test["user"],
                                         df_test["item"],
                                         df_test["rate"]],
                                         batch_size=-1)
    
    # Placeholders for the user ids, item ids and observed ratings of a batch.
    user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
    item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
    rate_batch = tf.placeholder(tf.float32, shape=[None])
    
    # Build the prediction graph and the training op; the cost tensor returned
    # by loss() is not used afterwards.
    infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
    _, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.0010, reg=0.05, device=place_device)

    六、创建会话

    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        sess.run(init_op)
        print("%s	%s	%s	%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
        # Sliding window holding the squared errors of the most recent
        # `samples_per_batch` training steps (about one epoch of batches).
        errors = deque(maxlen=samples_per_batch)
        start = time.time()
        for i in range(max_epochs * samples_per_batch):
            users, items, rates = next(iter_train)
            # One optimizer step; also fetch the predictions for this batch.
            _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                                   item_batch: items,
                                                                   rate_batch: rates})
            # Clamp predictions to [1, 5] before measuring the error.
            pred_batch = clip(pred_batch)
            errors.append(np.power(pred_batch - rates, 2))
            # Report once per epoch. This also fires at i == 0, where the
            # window holds a single batch.
            if i % samples_per_batch == 0:
                # Training RMSE over the sliding window.
                train_err = np.sqrt(np.mean(errors))
                test_err2 = np.array([])
                # Full pass over the test set. NOTE: this loop rebinds users,
                # items and rates; they are reassigned at the next step.
                for users, items, rates in iter_test:
                    # Only inference runs here, so rate_batch is not fed.
                    pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                            item_batch: items})
                    pred_batch = clip(pred_batch)
                    test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
                end = time.time()
                
                # Columns: epoch, train RMSE, test RMSE, seconds since last report.
                print("%02d	%.3f		%.3f		%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
                start = end
    
        # NOTE(review): the checkpoint prefix ends with '/', so the files get an
        # empty base name inside ./save -- confirm this is intended and that
        # the directory exists before saving.
        saver.save(sess, './save/')
    Epoch	Train Error	Val Error	Elapsed Time
    00	2.782		1.119		0.053 secs
    01	1.046		1.007		0.619 secs
    02	0.981		0.973		0.656 secs
    03	0.955		0.954		0.602 secs
    04	0.941		0.943		0.592 secs
    05	0.931		0.937		0.585 secs
    06	0.926		0.932		0.589 secs
    07	0.921		0.928		0.604 secs
    08	0.917		0.927		0.612 secs
    09	0.916		0.924		0.610 secs
    10	0.914		0.922		0.657 secs
    11	0.910		0.920		0.715 secs
    12	0.909		0.919		0.802 secs
    13	0.909		0.918		0.651 secs
    14	0.907		0.917		0.600 secs
    15	0.907		0.917		0.688 secs
    16	0.906		0.918		0.668 secs
    17	0.905		0.917		0.595 secs
    18	0.903		0.915		0.607 secs
    19	0.905		0.919		0.594 secs
    20	0.903		0.915		0.621 secs
    21	0.903		0.914		0.634 secs
    22	0.902		0.915		0.651 secs
    23	0.903		0.913		0.680 secs
    24	0.902		0.914		0.586 secs
    25	0.902		0.914		0.604 secs
    26	0.901		0.913		0.663 secs
    27	0.902		0.915		0.734 secs
    28	0.901		0.915		0.752 secs
    29	0.901		0.913		0.700 secs
    30	0.900		0.913		0.616 secs
    31	0.900		0.913		0.598 secs
    32	0.900		0.912		0.673 secs
    33	0.901		0.912		0.591 secs
    34	0.900		0.912		0.673 secs
    35	0.899		0.912		0.694 secs
    36	0.899		0.912		0.653 secs
    37	0.898		0.913		0.673 secs
    38	0.899		0.913		0.590 secs
    39	0.900		0.913		0.691 secs
    40	0.899		0.912		0.801 secs
    41	0.899		0.912		1.011 secs
    42	0.899		0.912		0.593 secs
    43	0.899		0.912		0.620 secs
    44	0.900		0.912		0.620 secs
    45	0.899		0.912		0.613 secs
    46	0.899		0.912		0.811 secs
    47	0.899		0.912		0.652 secs
    48	0.899		0.912		0.592 secs
    49	0.899		0.911		0.630 secs
  • 相关阅读:
    LinkButton(按钮)
    清理SharePoint 2010的SQL Server 2008 R2日志数据库的方法
    Sharepoint日志文件增长巨大的解决办法/缩小日志/删除日志
    PDF2SWF简单使用
    SharePoint 2010 网站备份还原简单介绍
    SolidWorks二次开发的研究
    基于VB语言对SolidWorks参数化设计的二次开发
    什么是PDM?
    SharePoint 2010配置PDF文件全文检索
    《博客园精华集--Sharepoint分册》第三轮结果(转)
  • 原文地址:https://www.cnblogs.com/gezhuangzhuang/p/10219332.html
Copyright © 2011-2022 走看看