First, create a readers.py module; the later program imports it as `readers` and uses its functions.
from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd


def read_file(filename, sep=" "):
    # Read the ratings file into a DataFrame and shift user/item IDs to start at 0
    col_names = ["user", "item", "rate", "st"]
    df = pd.read_csv(filename, sep=sep, header=None, names=col_names, engine='python')
    df["user"] -= 1
    df["item"] -= 1
    for col in ("user", "item"):
        df[col] = df[col].astype(np.int32)
    df["rate"] = df["rate"].astype(np.float32)
    return df


class ShuffleIterator(object):
    """
    Randomly generate batches
    """
    def __init__(self, inputs, batch_size=10):
        self.inputs = inputs
        self.batch_size = batch_size
        self.num_cols = len(self.inputs)
        self.len = len(self.inputs[0])
        self.inputs = np.transpose(np.vstack([np.array(self.inputs[i])
                                              for i in range(self.num_cols)]))

    def __len__(self):
        return self.len

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]


class OneEpochIterator(ShuffleIterator):
    """
    Sequentially generate one-epoch batches, typically for test data
    """
    def __init__(self, inputs, batch_size=10):
        super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
        if batch_size > 0:
            self.idx_group = np.array_split(np.arange(self.len),
                                            np.ceil(self.len / batch_size))
        else:
            self.idx_group = [np.arange(self.len)]
        self.group_id = 0

    def next(self):
        if self.group_id >= len(self.idx_group):
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id], :]
        self.group_id += 1
        return [out[:, i] for i in range(self.num_cols)]
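Before moving on, the sketch below shows how these helpers behave. It is only an illustration: the file sample_ratings.dat and its three rows are made up and are not part of the real dataset.

# Minimal sketch: exercise read_file and the two iterators on a tiny made-up file.
# "sample_ratings.dat" and its contents are hypothetical, purely for illustration.
import readers

with open("sample_ratings.dat", "w") as f:
    f.write("1::10::4::978300760\n")
    f.write("2::11::3::978302109\n")
    f.write("3::12::5::978301968\n")

df = readers.read_file("sample_ratings.dat", sep="::")
print(df)  # user/item IDs are shifted to start at 0, rate is float32

it_shuffle = readers.ShuffleIterator([df["user"], df["item"], df["rate"]], batch_size=2)
users, items, rates = next(it_shuffle)  # one random batch of size 2
print(users, items, rates)

it_epoch = readers.OneEpochIterator([df["user"], df["item"], df["rate"]], batch_size=-1)
for users, items, rates in it_epoch:    # exactly one pass over all rows
    print(users, items, rates)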
The data is mainly about movies and users: the MovieLens 1M ratings file used below.
# Data I/O imports
from collections import deque
from six import next

# Import the readers.py helpers defined above
import readers

# Main imports for training
import tensorflow as tf
import numpy as np

# Used to time each training epoch
import time
# Constant seed so the results are reproducible
np.random.seed(42)

# MovieLens 1M: 6,040 users and 3,952 movies
u_num = 6040
i_num = 3952

batch_size = 1000
# Dimensionality of the latent factors
dims = 5
# Maximum number of training epochs
max_epochs = 50

# Device to place the ops on
place_device = "/cpu:0"
1. Load the data and split it into training and test sets
def get_data():
    # Columns are, in order: user ID, item ID, rating, timestamp
    # Sample row: 3::1196::4::978297539
    df = readers.read_file("C:/Users/Administrator/.surprise_data/ml-1m/ratings.dat", sep="::")
    # Number of rows, needed below for the train/test split
    rows = len(df)
    # Purely integer-location based indexing: shuffle the row order
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # 90% for training, 10% for testing
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test


def clip(x):
    # Clip predictions to the valid rating range [1, 5]
    return np.clip(x, 1.0, 5.0)
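Side note: u_num and i_num were hardcoded above. Since read_file already shifts the IDs to start at 0, they could instead be derived from the data; the following is only a sketch that reuses the same ratings.dat path as get_data.

# Sketch: derive the user/item counts from the data instead of hardcoding them.
df_all = readers.read_file("C:/Users/Administrator/.surprise_data/ml-1m/ratings.dat", sep="::")
u_num = df_all["user"].max() + 1   # 6040 for MovieLens 1M
i_num = df_all["item"].max() + 1   # 3952 for MovieLens 1M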
2. Define the model, returning the predictions and the regularization term
def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    # Embedding tables and lookups are kept on the CPU
    with tf.device("/cpu:0"):
        # Variable scope
        with tf.variable_scope('lsi', reuse=tf.AUTO_REUSE):
            # Global bias variable
            # get_variable: prefixes the name with the current variable scope and performs reuse checks
            bias_global = tf.get_variable("bias_global", shape=[])
            # Per-user bias
            w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
            # Per-movie bias
            w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
            # Biases for one batch of users and movies
            bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
            bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
            # Latent-factor matrices for users and movies
            w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            # Factor embeddings for the given batch of users and items
            embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
            embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    with tf.device(device):
        # Dot product of the user and item factors (sum over the latent dimension)
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # L2 regularization term
        # l2_loss: computes half of the squared L2 norm of a tensor
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item),
                             name="svd_regularizer")
    # Return the predictions and the regularization term
    return infer, regularizer
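In plain terms, infer implements the usual biased matrix-factorization prediction: global bias + user bias + item bias + dot product of the user and item factor vectors. The NumPy sketch below repeats the same arithmetic for a single (user, item) pair; every number in it is made up for illustration.

import numpy as np

# Made-up values for one (user, item) pair
bias_global = 3.5
bias_user = 0.2            # this user rates slightly above average
bias_item = -0.1           # this movie is rated slightly below average
embd_user = np.array([0.1, -0.3, 0.05, 0.2, 0.0], dtype=np.float32)  # dim = 5
embd_item = np.array([0.4, 0.1, -0.2, 0.3, 0.1], dtype=np.float32)

# Same formula as the graph: reduce_sum(multiply(...)) is just a dot product
pred = np.sum(embd_user * embd_item) + bias_global + bias_user + bias_item
print(pred)  # predicted rating, before clipping to [1, 5]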
3. Define the loss function
def loss(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
    with tf.device(device):
        # L2 distance between the predictions (infer) and the true ratings (rate_batch)
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        # L2 penalty coefficient
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        # loss = data loss + regularization loss (regularizer * L2 penalty)
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        # Train with plain gradient descent
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    return cost, train_op
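tf.nn.l2_loss(x) returns half of the squared L2 norm, 0.5 * sum(x ** 2), so the cost being minimized is half the squared error on the batch plus reg times the (halved) squared norms of the batch embeddings. A NumPy sketch with made-up values:

import numpy as np

# Made-up batch of predictions and true ratings
infer = np.array([3.8, 2.9, 4.6], dtype=np.float32)
rates = np.array([4.0, 3.0, 5.0], dtype=np.float32)
reg = 0.05

# tf.nn.l2_loss(x) == 0.5 * sum(x ** 2)
cost_l2 = 0.5 * np.sum((infer - rates) ** 2)

# regularizer from model(): 0.5 * (||embd_user||^2 + ||embd_item||^2), made up here
regularizer = 0.5 * (np.sum(np.array([0.1, -0.3]) ** 2) + np.sum(np.array([0.4, 0.1]) ** 2))

cost = cost_l2 + reg * regularizer
print(cost_l2, regularizer, cost)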
4. Read the data to build the TensorFlow model
# Read the data from the ratings file to build the TensorFlow model
df_train, df_test = get_data()
# Despite its name, samples_per_batch is the number of batches per epoch
samples_per_batch = len(df_train) // batch_size
print("Number of train samples %d, test samples %d, samples per batch %d"
      % (len(df_train), len(df_test), samples_per_batch))
Number of train samples 900188, test samples 100021, samples per batch 900
# Look at the first 5 user values
print(df_train["user"].head())
print(df_test["user"].head())
0    1834
1    5836
2    1266
3    2468
4     117
Name: user, dtype: int32
0    5062
1     251
2    5831
3    2243
4    4903
Name: user, dtype: int32
# Look at the first 5 item values
print(df_train["item"].head())
print(df_test["item"].head())
0    1213
1     995
2     355
3    2040
4    2670
Name: item, dtype: int32
0    2917
1     291
2    2027
3    2310
4    1930
Name: item, dtype: int32
# Look at the first 5 rating values
print(df_train["rate"].head())
print(df_test["rate"].head())
0    5.0
1    4.0
2    2.0
3    5.0
4    4.0
Name: rate, dtype: float32
0    5.0
1    4.0
2    4.0
3    3.0
4    5.0
Name: rate, dtype: float32
5. Training
# Shuffle iterator that generates random batches for training
iter_train = readers.ShuffleIterator([df_train["user"], df_train["item"], df_train["rate"]],
                                     batch_size=batch_size)
# Sequentially generate one epoch of batches for testing
iter_test = readers.OneEpochIterator([df_test["user"], df_test["item"], df_test["rate"]],
                                     batch_size=-1)

# Create placeholders
user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num,
                           dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.0010, reg=0.05,
                   device=place_device)
6. Create the session
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    print("%s %s %s %s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        # One gradient-descent step on a random training batch
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        # Once per epoch (every samples_per_batch steps), report train and validation RMSE
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users, item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            print("%02d %.3f %.3f %.3f secs" % (i // samples_per_batch, train_err,
                                                np.sqrt(np.mean(test_err2)), end - start))
            start = end
    saver.save(sess, './save/')
Epoch Train Error Val Error Elapsed Time
00 2.782 1.119 0.053 secs
01 1.046 1.007 0.619 secs
02 0.981 0.973 0.656 secs
03 0.955 0.954 0.602 secs
04 0.941 0.943 0.592 secs
05 0.931 0.937 0.585 secs
06 0.926 0.932 0.589 secs
07 0.921 0.928 0.604 secs
08 0.917 0.927 0.612 secs
09 0.916 0.924 0.610 secs
10 0.914 0.922 0.657 secs
11 0.910 0.920 0.715 secs
12 0.909 0.919 0.802 secs
13 0.909 0.918 0.651 secs
14 0.907 0.917 0.600 secs
15 0.907 0.917 0.688 secs
16 0.906 0.918 0.668 secs
17 0.905 0.917 0.595 secs
18 0.903 0.915 0.607 secs
19 0.905 0.919 0.594 secs
20 0.903 0.915 0.621 secs
21 0.903 0.914 0.634 secs
22 0.902 0.915 0.651 secs
23 0.903 0.913 0.680 secs
24 0.902 0.914 0.586 secs
25 0.902 0.914 0.604 secs
26 0.901 0.913 0.663 secs
27 0.902 0.915 0.734 secs
28 0.901 0.915 0.752 secs
29 0.901 0.913 0.700 secs
30 0.900 0.913 0.616 secs
31 0.900 0.913 0.598 secs
32 0.900 0.912 0.673 secs
33 0.901 0.912 0.591 secs
34 0.900 0.912 0.673 secs
35 0.899 0.912 0.694 secs
36 0.899 0.912 0.653 secs
37 0.898 0.913 0.673 secs
38 0.899 0.913 0.590 secs
39 0.900 0.913 0.691 secs
40 0.899 0.912 0.801 secs
41 0.899 0.912 1.011 secs
42 0.899 0.912 0.593 secs
43 0.899 0.912 0.620 secs
44 0.900 0.912 0.620 secs
45 0.899 0.912 0.613 secs
46 0.899 0.912 0.811 secs
47 0.899 0.912 0.652 secs
48 0.899 0.912 0.592 secs
49 0.899 0.911 0.630 secs
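The validation RMSE settles at roughly 0.91 by the final epochs. The checkpoint written by saver.save can then be restored to score individual user/movie pairs. The snippet below is only a sketch: it assumes it runs in the same script (so model, clip and the constants are still available), rebuilds the same graph, and uses arbitrary example IDs.

# Sketch: rebuild the graph, restore the checkpoint saved above, and predict a few ratings.
# The user/item IDs below are arbitrary examples (0-based, as produced by read_file).
tf.reset_default_graph()
user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
infer, _ = model(user_batch, item_batch, user_num=u_num, item_num=i_num,
                 dim=dims, device=place_device)

saver = tf.train.Saver()
with tf.Session() as sess:
    # Same checkpoint prefix that was passed to saver.save above
    saver.restore(sess, './save/')
    preds = sess.run(infer, feed_dict={user_batch: np.array([0, 1, 2]),
                                       item_batch: np.array([10, 20, 30])})
    print(clip(preds))  # predicted ratings, clipped to [1, 5]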