Source code for openhgnn.layers.micro_layer.HGConv

import torch
from torch import nn
import torch.nn.functional as F

import dgl
from dgl.nn.pytorch.softmax import edge_softmax
import dgl.function as fn



[docs]
class AttConv(nn.Module):
    """
    Attention-based convolution was introduced in `Hybrid Micro/Macro Level Convolution for Heterogeneous Graph Learning
    <https://arxiv.org/abs/>`__
    and mathematically is defined as follows:

    """

    def __init__(self, in_feats: tuple, 
                 out_feats: int, 
                 num_heads: int, 
                 dropout: float = 0.0, 
                 negative_slope: float = 0.2):
        """
        Parameters
        ----------
        in_feats : pair of ints
            Input feature size.
        out_feats : int
            Output feature size.
        num_heads : int
            Number of heads in Multi-Head Attention.
        dropout : float, optional
            Dropout rate, defaults: 0.
        negative_slope : float, optional
            Negative slope rate, defaults: 0.2.
        """
        super(AttConv, self).__init__()
        self._in_src_feats, self._in_dst_feats = in_feats[0], in_feats[1]
        self._out_feats = out_feats
        self._num_heads = num_heads

        self.dropout = nn.Dropout(dropout)
        self.leaky_relu = nn.LeakyReLU(negative_slope)

    def forward(self,
                graph: dgl.DGLHeteroGraph,
                feat: tuple,
                dst_node_transformation_weight: nn.Parameter,
                src_node_transformation_weight: nn.Parameter,
                src_nodes_attention_weight: nn.Parameter):
        r"""Compute graph attention network layer.
        
        Parameters
        ----------
        graph: 
            specific relational DGLHeteroGraph
        feat: pair of torch.Tensor
            The pair contains two tensors of shape (N_{in}, D_{in_{src}})` and (N_{out}, D_{in_{dst}}).
        dst_node_transformation_weight: 
            Parameter (input_dst_dim, n_heads * hidden_dim)
        src_node_transformation_weight: 
            Parameter (input_src_dim, n_heads * hidden_dim)
        src_nodes_attention_weight: 
            Parameter (n_heads, 2 * hidden_dim)

        Returns
        -------
        torch.Tensor, shape (N, H, D_out)` where H is the number of heads, and D_out is size of output feature.
        """
        with graph.local_scope():
            # Tensor, (N_src, input_src_dim)
            feat_src = self.dropout(feat[0])
            # Tensor, (N_dst, input_dst_dim)
            feat_dst = self.dropout(feat[1])
            # Tensor, (N_src, n_heads, hidden_dim) -> (N_src, input_src_dim) * (input_src_dim, n_heads * hidden_dim)
            feat_src = torch.matmul(feat_src, src_node_transformation_weight).view(-1, self._num_heads, self._out_feats)
            # Tensor, (N_dst, n_heads, hidden_dim) -> (N_dst, input_dst_dim) * (input_dst_dim, n_heads * hidden_dim)
            feat_dst = torch.matmul(feat_dst, dst_node_transformation_weight).view(-1, self._num_heads, self._out_feats)

            # first decompose the weight vector into [a_l || a_r], then
            # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j, This implementation is much efficient
            # Tensor, (N_dst, n_heads, 1),   (N_dst, n_heads, hidden_dim) * (n_heads, hidden_dim)
            e_dst = (feat_dst * src_nodes_attention_weight[:, :self._out_feats]).sum(dim=-1, keepdim=True)
            # Tensor, (N_src, n_heads, 1),   (N_src, n_heads, hidden_dim) * (n_heads, hidden_dim)
            e_src = (feat_src * src_nodes_attention_weight[:, self._out_feats:]).sum(dim=-1, keepdim=True)
            # (N_src, n_heads, hidden_dim), (N_src, n_heads, 1)
            graph.srcdata.update({'ft': feat_src, 'e_src': e_src})
            # (N_dst, n_heads, 1)
            graph.dstdata.update({'e_dst': e_dst})
            # compute edge attention, e_src and e_dst are a_src * Wh_src and a_dst * Wh_dst respectively.
            graph.apply_edges(fn.u_add_v('e_src', 'e_dst', 'e'))
            # shape (edges_num, heads, 1)
            e = self.leaky_relu(graph.edata.pop('e'))

            # compute softmax
            graph.edata['a'] = edge_softmax(graph, e)

            graph.update_all(fn.u_mul_e('ft', 'a', 'msg'), fn.sum('msg', 'ft'))
            # (N_dst, n_heads * hidden_dim),   (N_dst, n_heads, hidden_dim) reshape
            dst_features = graph.dstdata.pop('ft').reshape(-1, self._num_heads * self._out_feats)

            dst_features = F.relu(dst_features)

        return dst_features