zoukankan      html  css  js  c++  java
  • Graph Attention Networks (GAT) 代码解读

    Graph Attention Networks (GAT) 代码解读

    1.1 代码结构

    .
    |--- data 		    # Cora数据集
    |--- models		    # GAT模型定义(gat.py)
    |--- pre_trained	# 预训练的模型
    |--- utils		    # 工具定义
    

    1.2 参数设置

    GAT/execute_cora.py

    # training params
    batch_size = 1
    nb_epochs = 100000
    patience = 100
    lr = 0.005  # learning rate
    l2_coef = 0.0005  # weight decay
    hid_units = [8] # numbers of hidden units per each attention head in each layer
    n_heads = [8, 1] # additional entry for the output layer
    residual = False
    nonlinearity = tf.nn.elu
    model = GAT
    

    1.3 导入数据

    GAT源码默认使用的Cora数据集。Cora的相关代码介绍可以参考这里

    数据预处理部分和GCN源码相同,可以参考这里

    最终载入的数据adj为邻接矩阵,表示2708篇文章之间的索引关系。features表示1433个单词在2708篇文章中是否存在。

    GAT/utils/process.py

    def load_data(dataset_str):
        # ...
        print(adj.shape) # (2708, 2708)
        print(features.shape) #(2708, 1433)
    

    1.4 特征预处理

    GAT/utils/process.py

    def preprocess_features(features):
        """Row-normalize feature matrix and convert to tuple representation"""
        rowsum = np.array(features.sum(1))
        r_inv = np.power(rowsum, -1).flatten()
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv)
    
        features = r_mat_inv.dot(features)
        return features.todense(), sparse_to_tuple(features)
    

    1.5 模型定义-前向传播

    添加一层单头注意力

    '''
    输入是(B,N,D),B是batch size,N是节点数,D是每个节点的原始特征维数
    输出是(B,N,F),F是每个节点的新特征维数
    每个节点从维度D到维度F是按注意力为权重聚合了邻居节点的特征
    '''
    
    def att_head(seq, out_sz, bias_mat, activation, in_drop = 0.0, coef_drop = 0.0, residual = False):
        '''
        seq:输入(B,N,D),B是batch size,N是节点数,D是每个节点的原始特征维数
        out_sz:每个节点的输出特征维数,设为F
        bias_mat:(N,N)掩码矩阵
        activation:激活函数
        in_drop:输入的dropout率
        coef_drop:注意力矩阵的dropout率
        residual:是否使用残差网络
        '''
        
        with tf.name_scope('my_attn'):
            # drop out 防止过拟合;如果为0则不设置该层
            if in_drop != 0.0:
                seq = tf.nn.dropout(seq, 1.0 - in_drop)
            
            '''
            为了获得足够的表达能力以将输入特征转化为高级特征,需要至少一种可学习的线性变换。为此,作为第一步,
            我们学习一个W矩阵用于投影特征
            实现公式seq_fts = Wh,即每个节点的维度变换
            '''
            
            # F2F'
            seq_fts = tf.keras.layers.Conv1D(seq, out_sz, 1, use_bias=False)
    
            '''
            实现公式 f_1 = a(Whi); f_2 = a(Whj)
            f_1+f_2的转置实现了logits = eij = a(Whi) + a(Whj)
            eij经过激活,softmax得到论文中的aij,即点i对点j的注意力
            bias_mat是为了让非互为邻居的注意力不要j进入softmax的计算
            只有互为邻居的注意力才能进入softmax,从而保证了注意力在局部
            '''
            
            # (B, N, F) => (B, N, 1)
            f_1 = tf.keras.layers.Conv1D(seq_fts, 1, 1)
            # (B, N, F) => (B, N, 1)
            f_2 = tf.keras.layers.Conv1D (seq_fts, 1, 1)
            
            # (B, N, 1) + (B, N, 1) = (B, N, N)
            # logits 即 eij
            logits = f_1 + tf.transpose(f_2, [0, 2, 1])
            # (B, N, N) + (1, N, N) => (B, N, N) => softmax => (B, N, N)
            # 这里运用了 tensorflow 的广播机制
            # 得到的logits 并不是一个对角矩阵, 这是因为 f_1 和 f_2并非同一个参数 a
            # logits{i,j} 等于 a1(Whi) + a2(Whj)
            
            # 注意力系数矩阵coefs=(aij)_{N*N}
            # bias_mat 体现 mask 思想, 保留了图的结构信息, 
    
            
            coefs = tf.nn.softmax(tf.nn.leaky_relu(logits) + bias_mat)
            
            # 输入矩阵、注意力系数矩阵的dropout操作
            if coef_drop != 0.0:
                coefs = tf.nn.dropout(coefs, 1.0 - coef_drop)
            if in_drop != 0.0:
                seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)
                
            '''
            实现 hi = sum(aijWhj)
            即节点i根据注意力聚合邻居特征
            '''
            
            # (B, N, N) * (B, N, F) => (B, N, F)
            vals = tf.matmul(coefs, seq_fts)
            
            
            
            # 添加偏置项
            ret = tf.contrib.layers.bias_add(vals)
            
            '''
            添加残差连接后,激活
            如果输入(B, N, D)和聚合了节点特征的输出(B, N, F)的最后一个维度相同,则直接相加
            否则将(B, N, D)线性变换为(B, N, F) 再相加
            '''
            
            # residual connection
            if residual:
                # D != F
                if seq.shape[-1] != ret.shape[-1]:
                    ret = ret + conv1d(seq, ret.shape[-1], 1) # activation
                else:
                    ret = ret + seq
            
            return activation(ret) # activation
    

    模型定义

    class BaseGAttN:
        def loss(logits, labels, nb_classes, class_weights):
            sample_wts = tf.reduce_sum(tf.multiply(tf.one_hot(labels, nb_classes), class_weights), axis=-1)
            xentropy = tf.multiply(tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits), sample_wts)
            return tf.reduce_mean(xentropy, name='xentropy_mean')
        
        def training(loss, lr, l2_coef):
            # weight decay
            vars = tf.trainable_variables()
            lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in vars if v.name not in ['bias', 'gamma', 'b', 'g', 'beta']] * l2_coef)
            
            # optimizer 
            opt = tf.train.AdamOptimizer(learning_rate = lr)
            
            # training op
            train_op = opt.minimize(loss + lossL2)
            
            return train_op
        
        
        def masked_softmax_cross_entropy(logits, labels, mask):
            '''
            Softmax cross-entropy loss with masking.
            logits: 模型的输出,维度(B, C); B是样本量, C是输出维度
            labels: 模型的标签,维度(B, C)
            mask: 掩码,维度(B, )
            '''
            
            # logits 先用softmax转化为概率分布,再和labelsj计算交叉熵
            # loss 维度是(B,)
            loss = tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels)
            
            # 将数据类型转化为 tf.float32
            mask = tf.cast(mask, dtype = tf.float32)
            
            # 将mask值归一化
            mask /= tf.reduce_mean(mask)
            
            # 屏蔽掉某些样本的损失
            loss *= mask
            
            # 返回均值损失
            return tf.reduce_mean(loss)
        
        
        def masked_sigmoid_cross_entropy(logits, labels, mask):
            '''
            Softmax cross-entropy loss with masking.
            logits:(B, C), 模型输出; B是样本量,C是输出维度
            labels:(B, C), 真实标签
            mask: 掩码,维度(B,)
            '''
            labels = tf.cast(mask, dtype = tf.float32)
            # loss 维度是(B,)
            loss = tf.nn.sigmoid_cross_entropy_with_logits(logits = logits, labels = labels)
            # (B,C) =>(B,)
            loss = tf.reduce_mean(loss, axis = 1)
            
            mask /= tf.reduce_mean(mask)
            loss *= mask
            
            return tf.reduce_mean(loss)
        
        def masked_accuracy(logits, labels, mask):
            '''
            Accuracy with masking
            logits:(B, C), 模型输出; B是样本量, C是输出维度
            labels:(B, C), 真实标签
            mask: 掩码,维度(B,)
            '''
            
            # 计算预测值和真实值的索引相同,则预测正确
            correct_prediction = tf.equal( tf.argmax(logits, 1), tf.argmax(labels, 1) )
            accuracy_all = tf.cast( correct_prediction, tf.float32 )
            mask = tf.cast( mask, dtype = tf.float32 )
            mask /= tf.reduce_mean(mask)
            accuracy_all *= mask
            return tf.reduce_mean(accuracy_all)
        
        
    #%%
    class GAT(BaseGAttN):
        
        def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop, bias_mat,
                      hid_mat, hid_units, n_heads, activation = tf.nn.elu, residual = False):
            '''
            inputs:(B,N,D), B是batch size, N是节点数, D是每个节点的原始特征维数
            nb_classes: 分类任务的类别数, 设为C
            nb_nodes: 节点个数,设为N
            training: 标志'训练阶段', '测试阶段'
            attn_drop: 注意力矩阵dropout率,防止过拟合
            ffd_drop: 输入的dropout率,防止过拟合
            bias_mat: 一个(N, N)矩阵,由邻接矩阵A变化而来,是注意力矩阵的掩码
            hid_units: 列表, 第i个元素是第i层的每个注意力头的隐藏单元数
            n_heads: 列表, 第i个元素是第i层的注意力头数
            activation: 激活函数
            resudial: 是否采用残差连接
            '''
            
            
            '''
            第一层,由H1个注意力头,每个头的输入都是(B, N, D), 每个头的注意力输出都是(B, N, F1)
            将所有注意力头的输出聚合, 聚合为(B, N, F1*H1)
            '''
            attns = []
            # n_heads[0] = 第一层注意力头数, 设为 H1
            for i in range(n_heads[0]):
                attns.append(
                        attn_head(inputs, bias_mat = bias_mat, 
                                  out_sz = hid_units[0], activation = activatoin,
                                  in_drop = ffd_drop, coef_drop = attn_drop, residual = False)
                        ) 
                        
            # [(B, N, F1), (B, N, F1)..] => (B, N, F1 * H1)
            
            h_1 = tf.concat(attns, axis = -1) # 连接上一层
            
            '''
            中间层,层数是 len(hid_units)-1;
            第i层有Hi个注意力头,输入是(B, N, F1*H1),每头注意力输出是(B, N, F1);
            每层均聚合所有头的注意力, 得到(B, N, Fi * Hi)
            '''
            # len(hid_units) = 中间层的个数
            for i in range(1, len(hid_units)):
                h_old = h_1 # 未使用
                attns = []
                # n_heads[i] = 中间第i层的注意力头数,设为Hi
                for _ in range(n_heads[i]):
                    attns.append(
                            attn_head(h_1, bias_mat = bias_mat,
                                      out_sz = hid_units[i], activation = activation,
                                      in_drop = ffd_drop, coef_drop = attn_drop, residual = residual)
                            )
                
                # [(B, N, Fi), (B, N, Fi) ..] => (B, N, Fi*Hi)
                h_1 = tf.concat(attns, axis = -1) # 连接上一层
            
            '''
            最后一层,共有n_heads[-1]个注意力,一般为1
            输入: 最后一层的输出为(B, N, Fi*Hi)
            输出: (B, N, C), C是分类任务数
            输出:
            '''
            
            
            out = []
            for i in range(n_heads[-1]):
                out.append(
                        attn_head(h_1, bias_mat = bias_mat, 
                                  out_sz = nb_classes, activation = lambda x : x,
                                  in_  drop = ffd_drop, coef_drop = attn_drop, residual = False   )
                        )
            
            # 将多头注意力相加取平均
            logits = tf.add_n(out) / n_heads[-1]
            
            return logits
        
    
    ---- suffer now and live the rest of your life as a champion ----
  • 相关阅读:
    127.0.0.1(转) Anny
    轮岗开发后再看测试(转) Anny
    如何做好功能测试的方法(转) Anny
    Search Framework: Search Result checklist(转) Anny
    What is a Private IP Address(转) Anny
    Private IP Addresses(转) Anny
    公共模式资源库链接 Anny
    What is Dynamic DNS? Anny
    随机数产生
    tomcat源码阅读_代码篇4
  • 原文地址:https://www.cnblogs.com/popodynasty/p/14551992.html
Copyright © 2011-2022 走看看