  • Implementing FM in TensorFlow

    Most of the TensorFlow FM implementations I found online ignore how sparse the sample data actually is when FM is used in practice.

    In my implementation I use embedding_lookup_sparse to handle this sparsity.

    For the second-order part, embedding_lookup_sparse cannot compute the "square of the sum" and the "sum of the squares" directly, so I wrote that piece myself, following the sum and mean combiners inside embedding_lookup_sparse. The data-input part still needs work; switching to tf.data (Dataset) would be better.
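
    For reference, the rewrite behind the "square of the sum" / "sum of the squares" trick is the standard identity for the FM pairwise term (with x_i the non-zero feature values and v_i their embedding vectors):

    \frac{1}{2}\sum_{f=1}^{k}\Big[\Big(\sum_{i} v_{i,f}\,x_i\Big)^{2}-\sum_{i}\big(v_{i,f}\,x_i\big)^{2}\Big]=\sum_{i<j}\langle v_i,v_j\rangle\,x_i x_j

    A quick NumPy-only sanity check of this identity (not part of the model below, just a sketch to convince yourself the rewrite is correct):

    import numpy as np

    rng = np.random.RandomState(0)
    x = rng.rand(5)          # non-zero feature values of one sample
    V = rng.randn(5, 4)      # their embedding vectors, k = 4

    xv = x[:, None] * V      # x_i * v_i for every non-zero feature
    rewrite = 0.5 * np.sum(np.square(xv.sum(axis=0)) - np.square(xv).sum(axis=0))
    pairwise = sum(V[i].dot(V[j]) * x[i] * x[j]
                   for i in range(5) for j in range(i + 1, 5))
    print(rewrite, pairwise)  # the two values agree up to floating-point error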

    The code is as follows:

    import tensorflow as tf
    from tensorflow.python.ops import math_ops
    from tensorflow.python.framework import dtypes
    from tensorflow.python.ops import array_ops
    import random
    import numpy as np
    from sklearn import metrics
    
    class Args():
        feature_size=925
        field_size=15
        embedding_size = 20
        epoch = 3
        batch_size = 2000
        learning_rate = 0.001
        l2_reg_rate = 0.001
        checkpoint_dir = "./model"
        is_training = True
    
    class FMmodel():
        def __init__(self):
            self.feature_sizes = Args.feature_size
            self.field_size = Args.field_size
            self.embedding_size = Args.embedding_size
            self.l2_reg_rate = Args.l2_reg_rate
            self.epoch = Args.epoch
            self.learning_rate = Args.learning_rate
            self.weight = {}
            self.model_path = Args.checkpoint_dir
            self.batch_size = Args.batch_size
    
        def build_model(self,is_warm_up=False):
            self.x1_index = tf.sparse_placeholder(tf.int64,name="x1_index")
            self.x1_value = tf.sparse_placeholder(tf.float32,name="x1_value")
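            # x1_index carries the feature ids and x1_value the matching feature values;
            # both share the same (row, position) sparse layout (see __gen_sparse_tensor below).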
            self.labels = tf.placeholder(tf.float32,name="labels",shape=[None,1])
            init_randomW = tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None, dtype=tf.float32)
            init_randomV = tf.random_normal_initializer(mean=0.0, stddev=0.00001, seed=None, dtype=tf.float32)
            # embedding vectors V, one per feature
            self.weight["feature_weight"] = tf.get_variable(
                shape =[self.feature_sizes,self.embedding_size],
                name='feature_weight',
                initializer=init_randomV
            )
    
            # first-order weights w (linear-term coefficients)
            self.weight["feature_first"] = tf.get_variable(
                shape=[self.feature_sizes,1],
                initializer=init_randomW,
                name='feature_first')
    
            self.weight["bais"] = tf.get_variable(shape=[1,1],initializer=tf.constant_initializer(0.0),name="bais")
    
            # [batch_size, 1] linear part: sum_i x_i * w_i
            self.line_part1 = tf.nn.embedding_lookup_sparse(self.weight["feature_first"],
                                                            sp_ids=self.x1_index,sp_weights=self.x1_value,combiner='sum')
            self.line_part1_shape = tf.shape(self.line_part1)
            # [batch_size, embedding_size]: per-sample sum_i x_i * v_i
            self.embedding_part1_sum_square = tf.nn.embedding_lookup_sparse(self.weight["feature_weight"],
                                                          sp_ids=self.x1_index,sp_weights=self.x1_value,combiner='sum')
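            # embedding_lookup_sparse cannot return the "sum of squares" directly, so the
            # lines below reproduce its internals by hand: look up the embedding of every
            # non-zero entry, weight it by x_i, square it, and sum within each sample.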
    
            # flattened feature ids of every non-zero entry in the batch
            ids_1 = self.x1_index.values
    
            self.ids1,self.idx1 = tf.unique(ids_1)
    
            self.weight_1 = self.x1_value.values
    
            self.weight_1 = tf.reshape(self.weight_1,[-1,1])
    
            if self.weight_1.dtype != dtypes.float32:
                self.weight_1 = math_ops.cast(self.weight_1,dtypes.float32)
    
            # [num_unique_ids, embedding_size]: look up each distinct id once, then gather back one row per non-zero entry
            self.embedding_1 = tf.nn.embedding_lookup(self.weight["feature_weight"],ids=self.ids1)
    
            self.new_embedding_1 = tf.gather(self.embedding_1,self.idx1)
    
            #[batch_value_count,embedding_size]
            self.embedding_weight_part1 =tf.multiply(self.weight_1,self.new_embedding_1)
    
            self.embedding_weight_part1_square = tf.square(self.embedding_weight_part1)
    
    
            self.segment_ids_1 = self.x1_index.indices[:, 0]
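            # the first column of the sparse indices is the row (sample) number within the
            # batch; segment_sum over it yields the per-sample "sum of squares"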
    
            if self.segment_ids_1.dtype != dtypes.int32:
                self.segment_ids_1 = math_ops.cast(self.segment_ids_1, dtypes.int32)
    
            self.embeddings_square_sum1 = tf.math.segment_sum(
                self.embedding_weight_part1_square,self.segment_ids_1)
    
            self.ess1_shape = tf.shape(self.embeddings_square_sum1)
            #[batch_size,1]
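            # FM pairwise term: 0.5 * sum_f [ (sum_i x_i*v_if)^2 - sum_i (x_i*v_if)^2 ]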
            self.y1_v = 0.5*tf.reduce_sum(tf.subtract(tf.square(self.embedding_part1_sum_square),self.embeddings_square_sum1),1)
            self.y1_v = tf.reshape(self.y1_v,[-1,1])
            self.y1 = tf.add(tf.add(self.line_part1,self.y1_v),self.weight["bais"])
    
            self.o1 = tf.sigmoid(self.y1)
            self.loss = tf.losses.log_loss(labels=self.labels,predictions=self.o1)
            self.error = tf.reduce_mean(self.loss)
            # with tf.name_scope("loss"):
            #     tf.summary.scalar("loss", self.error)
    
            self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.error)
            self.session = tf.Session()
            self.init = tf.group(tf.global_variables_initializer())
            if is_warm_up:
                self.saver = tf.train.Saver(tf.global_variables())
                self.saver.restore(self.session, self.model_path)
            else:
                self.session.run(self.init)
    
        def predict(self,file_name):
            result_list = []
            for x1_index, x1_value, true_labels in self.load_data(file_name,is_train=False):
                predict1 = self.session.run([self.o1],feed_dict={
                    self.x1_value:x1_value,
                    self.x1_index:x1_index
                })
                # print(len(predict1))
                # print(len(predict1[0]))
                # print(true_labels.shape)
                for i in range(len(predict1[0])):
                    result_list.append((true_labels[i][0],predict1[0][i]))
                print(len(result_list))
            with open("./data/result.txt",'w') as file1:
                for tp in result_list:
                    file1.write(str(tp[0])+","+str(tp[1][0])+"\n")
    
        def save(self,sess,path):
            saver = tf.train.Saver()
            saver.save(sess,save_path=path)
    
        def restore(self,sess,path):
            saver = tf.train.Saver()
            saver.restore(sess,save_path=path)
    
        def train(self,train_data_file):
            index=0
            for x1_index,x1_value,true_labels in self.load_data(train_data_file):#ids_1,ids_2,weight_1,weight_2,
                if(len(true_labels)<2):
                    #print("###$$$$$$ : "+str(len(true_labels)))
                    continue
                my_o1,myerror,_=self.session.run([self.o1,self.error,self.opt],feed_dict={
                    self.x1_index : x1_index,
                    self.x1_value : x1_value,
                    self.labels:true_labels
                })
                index+=1
                # if(index%1000==0):
                #     for i in range(len(my_o1)):
                #         print(str(my_o1[i])+" : "+str(true_labels[i]))
                #y_t = true_labels.reshape([-1])
                #y_p = np.asarray(my_o1,dtype=float).reshape([-1])
                print(metrics.roc_auc_score(true_labels,my_o1))
    
                #print(my_o1)
    
            self.save(self.session,self.model_path)
    
            self.session.close()
    
        def load_data(self,file_name,epoch=3,is_train=True):
            def __parse_line(line):
                tokens = line.split("#")[0].split()
                assert len(tokens)>=3, "Ill-formatted line: {}".format(line)
                label = float(tokens[0])
                uid = tokens[1]
                mid = tokens[2]
                kv_pairs = [kv.split(":") for kv in tokens[3:]]
                features = {k: float(v) for (k,v) in kv_pairs}
                #print(type(features))
                qid = uid
                return qid,features,label
    
            def __encoder_line(sample):
                qid = sample[0]
                features = sample[1]
                label = sample[2]
                features_arr = []
                for key in features.keys():
                    features_arr.append(str(key)+":"+str(features[key]))
                return str(label)+" "+"qid:"+str(qid)+" "+" ".join(features_arr)
    
            def __gen_sparse_tensor(sample_list):
                # Build one batch of inputs: turn sample_list into the two
                # SparseTensorValue objects (feature ids and feature values) plus labels.
                sample_index = 0
                tensor_x1_index_ids = []
                tensor_x1_index_value = []
    
                tensor_x1_value_ids = []
                tensor_x1_value_values = []
                label_list = []
                for sample in sample_list:
                    x1_feature = sample[0]
                    label_list.append([float(sample[1])])
                    tmpIndex = 0
                    for key in x1_feature.keys():
                        tensor_x1_index_ids.append([sample_index, tmpIndex])
                        tensor_x1_index_value.append(int(key))
    
                        tensor_x1_value_ids.append([sample_index, tmpIndex])
                        tensor_x1_value_values.append(float(x1_feature[key]))
                        tmpIndex += 1
                    sample_index+=1
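                # x1_index.values holds the feature ids (sp_ids for the embedding lookups)
                # and x1_value.values the feature values (sp_weights); the two share indices.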
                x1_index = tf.SparseTensorValue(indices=tensor_x1_index_ids,values=tensor_x1_index_value,
                                           dense_shape=[len(sample_list),self.feature_sizes])
                x1_value = tf.SparseTensorValue(indices=tensor_x1_value_ids,values=tensor_x1_value_values,
                                           dense_shape=[len(sample_list),self.feature_sizes])
                #print("AHAHAHAHA : "+str(len(sample_list)))
                return x1_index,x1_value,np.asarray(label_list,dtype=np.float32)
    
            def __gen_train_data(file_name):
                new_file_name  = file_name+"_train_data"
                with open(file_name,'r') as filer:
                    with open(new_file_name,'w') as filew:
                        sample_list = []
                        now_qid = None
                        for l in filer:
                            qid, features, label = __parse_line(l)
                            if now_qid is None or now_qid==qid:
                                now_qid = qid
                                sample_list.append((qid,features,label))
                            else:
                                sorted_sample_list = sorted(sample_list,key=lambda x:x[2],reverse=True)
                                for sample in sorted_sample_list:
                                    sample_str = __encoder_line(sample)
                                    filew.write(sample_str+"\n")
                                sample_list = []
                                now_qid = qid
                                sample_list.append((qid, features, label))
    
                return new_file_name
    
            if is_train:
                new_file_name ="./data/new_final_train_data.txt" # __gen_train_data(file_name)
                print("process data")
                sample_list = []
                while epoch>0:
                    epoch-=1
                    with open(new_file_name,'r') as filer:
                        for l in filer:
                            qid,features,label = __parse_line(l)
                            #print(len(sample_list))
                            if len(sample_list)<self.batch_size*10:
                                sample_list.append((features,label))
                            else:
                                random.shuffle(sample_list)
                                start = 0
                                end = len(sample_list)
                                while (start < end):
                                    tmpEnd = min(end, start + self.batch_size)
                                    sub_list = sample_list[start:tmpEnd]
                                    x1_index, x1_value,labels = __gen_sparse_tensor(sub_list)  # ids_1,ids_2,weight_1,weight_2,
                                    if(labels.sum()<1):
                                        start += self.batch_size
                                        continue
                                    yield (x1_index, x1_value,labels)  # ids_1,ids_2,weight_1,weight_2,
                                    start += self.batch_size
                                sample_list = []
                                sample_list.append((features, label))
            else:
                with open(file_name, 'r') as filer:
                    sample_list = []
                    for l in filer:
                        qid, features, label = __parse_line(l)
                        # print(len(sample_list))
                        if len(sample_list) < self.batch_size:
                            sample_list.append((features, label))
                        else:
                            start = 0
                            end = len(sample_list)
                            while (start < end):
                                tmpEnd = min(end, start + self.batch_size)
                                sub_list = sample_list[start:tmpEnd]
                                x1_index, x1_value, labels = __gen_sparse_tensor(sub_list)  # ids_1,ids_2,weight_1,weight_2,
                                yield (x1_index, x1_value, labels)  # ids_1,ids_2,weight_1,weight_2,
                                start += self.batch_size
                            sample_list = []
                            sample_list.append((features, label))
    
    
    if __name__ =="__main__":
        fm = FMmodel()
        fm.build_model(is_warm_up=True)
        #fm.train("./data/new_final_train_data.txt")
        fm.predict("./data/test.data")
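
    As noted at the top, the input pipeline would be cleaner with tf.data. Below is a minimal sketch (not part of the original code; decode_line and input_fn are illustrative names) of how the line parsing could be moved into a Dataset pipeline using TF 1.x string ops, reusing the tf import and the Args class defined above. It assumes the same "label uid mid feature:value ... #comment" line format as __parse_line, pads each batch with zero-valued features (which add nothing to the FM sums), and the model would still have to be rewired to consume these tensors instead of the feed_dict placeholders.

    def decode_line(line):
        # drop any trailing "#..." comment, then split on spaces
        line = tf.string_split([line], '#').values[0]
        tokens = tf.string_split([line], ' ')
        label = tf.string_to_number(tokens.values[0], out_type=tf.float32)
        # tokens 1 and 2 (uid, mid) are skipped, as in __parse_line
        kv = tf.string_split(tokens.values[3:], ':')
        kv = tf.reshape(kv.values, kv.dense_shape)   # [num_features, 2] strings
        feat_ids, feat_vals = tf.split(kv, 2, axis=1)
        feat_ids = tf.squeeze(tf.cast(tf.string_to_number(feat_ids, out_type=tf.int32), tf.int64), axis=1)
        feat_vals = tf.squeeze(tf.string_to_number(feat_vals, out_type=tf.float32), axis=1)
        return feat_ids, feat_vals, tf.reshape(label, [1])

    def input_fn(file_name, batch_size=Args.batch_size, epochs=Args.epoch):
        dataset = (tf.data.TextLineDataset(file_name)
                   .map(decode_line, num_parallel_calls=4)
                   .shuffle(10 * batch_size)
                   .repeat(epochs)
                   .padded_batch(batch_size,
                                 padded_shapes=([None], [None], [1]),
                                 padding_values=(tf.constant(0, tf.int64),
                                                 tf.constant(0.0, tf.float32),
                                                 tf.constant(0.0, tf.float32))))
        return dataset.make_one_shot_iterator().get_next()

    The padded feat_ids / feat_vals batch could then be turned back into the two SparseTensors the model expects (for example by masking out the zero-valued padding), or the second-order computation could be rewritten against the dense padded batch.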
    【Java123】XML与JSON互相转化
  • Original post: https://www.cnblogs.com/earendil/p/10772972.html