  • 使用 TensorFlow 构造隐语义模型的推荐系统


    from __future__ import absolute_import, division, print_function
    import numpy as np
    import pandas as pd
    def read_file(filname, sep="\t"):
        """Load a header-less ratings file into a DataFrame.

        The file is parsed into the columns ``user``, ``item``, ``rate`` and
        ``st``. User and item ids are shifted down by one and stored as int32;
        ratings are stored as float32.

        Args:
            filname: path or file-like object accepted by ``pd.read_csv``.
            sep: field separator (a tab by default).

        Returns:
            The parsed ``pandas.DataFrame``.
        """
        columns = ["user", "item", "rate", "st"]
        frame = pd.read_csv(filname, sep=sep, header=None, names=columns, engine='python')
        # Shift the ids down by one and narrow them to int32 in a single pass.
        for id_col in ("user", "item"):
            frame[id_col] = (frame[id_col] - 1).astype(np.int32)
        frame["rate"] = frame["rate"].astype(np.float32)
        return frame
    class ShuffleIterator(object):
        """Endless iterator that yields randomly sampled batches.

        Every batch is built from ``batch_size`` row indices drawn with
        ``np.random.randint``, i.e. uniformly and with replacement, so a batch
        may contain repeated rows and iteration never raises ``StopIteration``.
        """

        def __init__(self, inputs, batch_size=10):
            """Stack the input columns into a single 2-D array.

            Args:
                inputs: sequence of equal-length 1-D columns (lists, arrays,
                    pandas Series, ...).
                batch_size: number of rows returned by each ``next`` call.
            """
            self.batch_size = batch_size
            self.num_cols = len(inputs)
            self.len = len(inputs[0])
            # One row per sample, one column per input. vstack promotes all
            # columns to a common dtype.
            self.inputs = np.transpose(np.vstack([np.array(col) for col in inputs]))

        def __len__(self):
            """Return the number of samples (rows)."""
            return self.len

        def __iter__(self):
            return self

        def __next__(self):
            # Python 3 iterator protocol; delegates to the Python 2 style name.
            return self.next()

        def next(self):
            """Return one random batch as a list with one array per input column."""
            ids = np.random.randint(0, self.len, (self.batch_size,))
            out = self.inputs[ids, :]
            return [out[:, i] for i in range(self.num_cols)]
    class OneEpochIterator(ShuffleIterator):
        """Sequentially generate one epoch of batches, typically for test data.

        Rows are visited in order, split into consecutive index groups. After
        the last group ``next`` raises ``StopIteration`` and rewinds, so the
        same iterator object can be looped over again for the next evaluation.
        """

        def __init__(self, inputs, batch_size=10):
            """Pre-compute the index group of every batch.

            Args:
                inputs: sequence of equal-length 1-D columns.
                batch_size: rows per batch; a value <= 0 means "the whole data
                    set as a single batch".
            """
            super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
            if batch_size > 0:
                # Integer ceiling division; at least one group so that
                # np.array_split does not reject an empty data set.
                num_groups = max(1, -(-self.len // batch_size))
                self.idx_group = np.array_split(np.arange(self.len), num_groups)
            else:
                self.idx_group = [np.arange(self.len)]
            self.group_id = 0

        def next(self):
            """Return the next sequential batch as a list with one array per column.

            Raises:
                StopIteration: once every group has been returned; the position
                    is reset first so iteration can start over.
            """
            if self.group_id >= len(self.idx_group):
                self.group_id = 0
                raise StopIteration
            out = self.inputs[self.idx_group[self.group_id], :]
            self.group_id += 1
            return [out[:, i] for i in range(self.num_cols)]


    # Data I/O helpers
    from collections import deque
    from six import next
    # Batch readers from the readers module (readers.py)
    import readers
    # Main imports for training
    import tensorflow as tf
    import numpy as np
    # Used to time each training epoch
    import time
    # Constant seed for reproducible results
    # NOTE(review): no seeding call is present in this section - confirm whether one was lost
    # Roughly 3,900 movies and 6,040 users
    u_num = 6040 
    i_num = 3952 
    batch_size = 1000 
    # Dimensionality of the data (passed to model() as `dim`, the embedding width)
    dims = 5    
    # Maximum number of training epochs
    max_epochs = 50   
    # Device to run on
    place_device = "/cpu:0"


    def get_data(filename="C:/Users/Administrator/.surprise_data/ml-1m/ratings.dat",
                 sep="::", train_fraction=0.9):
        """Load the ratings file, shuffle it and split it into train and test sets.

        Each record holds user id, item id, rating and timestamp, for example
        ``3::1196::4::978297539``.

        Args:
            filename: path of the ratings file. Defaults to the location that
                was previously hard-coded, so ``get_data()`` behaves as before.
            sep: field separator of the ratings file.
            train_fraction: share of the shuffled rows used for training; the
                remainder becomes the test set.

        Returns:
            A ``(df_train, df_test)`` tuple of DataFrames.
        """
        df = readers.read_file(filename, sep=sep)
        rows = len(df)
        # Shuffle by selecting rows through a random permutation of positions.
        df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
        split_index = int(rows * train_fraction)
        df_train = df[0:split_index]
        df_test = df[split_index:].reset_index(drop=True)
        return df_train, df_test
    def clip(x):
        """Clamp the values of ``x`` to the closed interval [1.0, 5.0]."""
        lowest, highest = 1.0, 5.0
        return np.clip(x, a_min=lowest, a_max=highest)


    def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
        """Build the biased latent-factor rating predictor.

        The prediction for a (user, item) pair is the dot product of their
        embedding vectors plus a global bias, a user bias and an item bias.

        Args:
            user_batch: tensor of user indices used for the embedding lookups.
            item_batch: tensor of item indices used for the embedding lookups.
            user_num: number of rows of the user bias/embedding variables.
            item_num: number of rows of the item bias/embedding variables.
            dim: width of the user and item embedding vectors.
            device: device on which the prediction and regularizer ops are placed.

        Returns:
            A ``(infer, regularizer)`` tuple: the predicted ratings and the sum
            of the L2 losses of the looked-up user and item embeddings.
        """
        # Variables and lookups are pinned to the CPU regardless of `device`.
        with tf.device("/cpu:0"):
            # AUTO_REUSE lets repeated calls share the same variables.
            with tf.variable_scope('lsi', reuse=tf.AUTO_REUSE):
                # Global bias (scalar).
                bias_global = tf.get_variable("bias_global", shape=[])
                # Per-user and per-item bias tables.
                w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
                w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
                # Biases of the users and items in the current batch.
                bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
                bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
                # User and item embedding tables.
                # NOTE(review): the continuation of these two calls was lost in the
                # source; they are closed here with get_variable's default
                # initializer - confirm whether an explicit initializer was intended.
                w_user = tf.get_variable("embd_user", shape=[user_num, dim])
                w_item = tf.get_variable("embd_item", shape=[item_num, dim])
                # Embedding vectors of the users and items in the current batch.
                embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
                embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
        with tf.device(device):
            # Row-wise dot product of user and item embeddings.
            infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
            infer = tf.add(infer, bias_global)
            infer = tf.add(infer, bias_user)
            infer = tf.add(infer, bias_item, name="svd_inference")
            # L2 regularization term; tf.nn.l2_loss is half the squared L2 norm.
            # NOTE(review): any trailing argument of this call (e.g. an op name)
            # was lost in the source - confirm.
            regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item))
        # Return the predictions and the regularization term.
        return infer, regularizer


    def loss(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
        """Build the training objective and its gradient-descent update op.

        The cost is the L2 loss between predictions and true ratings plus the
        regularization term scaled by ``reg``.

        Args:
            infer: predicted ratings.
            regularizer: regularization term produced together with ``infer``.
            rate_batch: true ratings.
            learning_rate: step size of the gradient-descent optimizer.
            reg: weight applied to ``regularizer``.
            device: device on which the ops are placed.

        Returns:
            A ``(cost, train_op)`` tuple.
        """
        with tf.device(device):
            # Data term: distance between predicted and actual ratings.
            residual = tf.subtract(infer, rate_batch)
            data_term = tf.nn.l2_loss(residual)
            # Regularization term weighted by the scalar penalty.
            reg_weight = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
            reg_term = tf.multiply(regularizer, reg_weight)
            cost = tf.add(data_term, reg_term)
            # Plain gradient descent on the combined cost.
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            train_op = optimizer.minimize(cost)
        return cost, train_op


    # Read the ratings file and split it into the train/test frames used to build the TensorFlow model
    df_train, df_test = get_data()
    # Despite its name this is the number of batches per epoch (training rows // batch_size)
    samples_per_batch = len(df_train) // batch_size
    print("Number of train samples %d, test samples %d, samples per batch %d" % 
          (len(df_train), len(df_test), samples_per_batch))
    Number of train samples 900188, test samples 100021, samples per batch 900
    # 查看前5个用户值
    0    1834
    1    5836
    2    1266
    3    2468
    4     117
    Name: user, dtype: int32
    0    5062
    1     251
    2    5831
    3    2243
    4    4903
    Name: user, dtype: int32
    # 查看前5个项目的值
    0    1213
    1     995
    2     355
    3    2040
    4    2670
    Name: item, dtype: int32
    0    2917
    1     291
    2    2027
    3    2310
    4    1930
    Name: item, dtype: int32
    # 查看前5个评分值
    0    5.0
    1    4.0
    2    2.0
    3    5.0
    4    4.0
    Name: rate, dtype: float32
    0    5.0
    1    4.0
    2    4.0
    3    3.0
    4    5.0
    Name: rate, dtype: float32


    # Shuffle iterator that produces random (user, item, rate) batches for training.
    # The three columns match the `users, items, rates` unpacking in the training loop.
    iter_train = readers.ShuffleIterator([df_train["user"],
                                          df_train["item"],
                                          df_train["rate"]],
                                         batch_size=batch_size)
    # Sequential one-epoch iterator over the test set.
    # NOTE(review): the original batch_size argument was lost in the source; a
    # non-positive value makes OneEpochIterator return the whole test set as a
    # single batch - confirm the intended value.
    iter_test = readers.OneEpochIterator([df_test["user"],
                                          df_test["item"],
                                          df_test["rate"]],
                                         batch_size=-1)
    # Placeholders fed with one batch of user ids, item ids and true ratings.
    user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
    item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
    rate_batch = tf.placeholder(tf.float32, shape=[None])
    # Prediction graph and its training op.
    infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
    _, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.0010, reg=0.05, device=place_device)


    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        # Initialize all variables before the first training step; init_op was
        # created above but never executed.
        sess.run(init_op)
        print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
        # Sliding window holding the squared errors of the most recent epoch's batches.
        errors = deque(maxlen=samples_per_batch)
        start = time.time()
        for i in range(max_epochs * samples_per_batch):
            users, items, rates = next(iter_train)
            _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                                   item_batch: items,
                                                                   rate_batch: rates})
            # Clamp predictions to the rating range before measuring the error.
            pred_batch = clip(pred_batch)
            errors.append(np.power(pred_batch - rates, 2))
            # Once per epoch: report train RMSE and evaluate on the test set.
            if i % samples_per_batch == 0:
                train_err = np.sqrt(np.mean(errors))
                test_err2 = np.array([])
                for users, items, rates in iter_test:
                    pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                            item_batch: items})
                    pred_batch = clip(pred_batch)
                    test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
                end = time.time()
                print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
                start = end
        # Persist the trained variables.
        saver.save(sess, './save/')
    Epoch	Train Error	Val Error	Elapsed Time
    00	2.782		1.119		0.053 secs
    01	1.046		1.007		0.619 secs
    02	0.981		0.973		0.656 secs
    03	0.955		0.954		0.602 secs
    04	0.941		0.943		0.592 secs
    05	0.931		0.937		0.585 secs
    06	0.926		0.932		0.589 secs
    07	0.921		0.928		0.604 secs
    08	0.917		0.927		0.612 secs
    09	0.916		0.924		0.610 secs
    10	0.914		0.922		0.657 secs
    11	0.910		0.920		0.715 secs
    12	0.909		0.919		0.802 secs
    13	0.909		0.918		0.651 secs
    14	0.907		0.917		0.600 secs
    15	0.907		0.917		0.688 secs
    16	0.906		0.918		0.668 secs
    17	0.905		0.917		0.595 secs
    18	0.903		0.915		0.607 secs
    19	0.905		0.919		0.594 secs
    20	0.903		0.915		0.621 secs
    21	0.903		0.914		0.634 secs
    22	0.902		0.915		0.651 secs
    23	0.903		0.913		0.680 secs
    24	0.902		0.914		0.586 secs
    25	0.902		0.914		0.604 secs
    26	0.901		0.913		0.663 secs
    27	0.902		0.915		0.734 secs
    28	0.901		0.915		0.752 secs
    29	0.901		0.913		0.700 secs
    30	0.900		0.913		0.616 secs
    31	0.900		0.913		0.598 secs
    32	0.900		0.912		0.673 secs
    33	0.901		0.912		0.591 secs
    34	0.900		0.912		0.673 secs
    35	0.899		0.912		0.694 secs
    36	0.899		0.912		0.653 secs
    37	0.898		0.913		0.673 secs
    38	0.899		0.913		0.590 secs
    39	0.900		0.913		0.691 secs
    40	0.899		0.912		0.801 secs
    41	0.899		0.912		1.011 secs
    42	0.899		0.912		0.593 secs
    43	0.899		0.912		0.620 secs
    44	0.900		0.912		0.620 secs
    45	0.899		0.912		0.613 secs
    46	0.899		0.912		0.811 secs
    47	0.899		0.912		0.652 secs
    48	0.899		0.912		0.592 secs
    49	0.899		0.911		0.630 secs
