import dgl
import math
import random
import numpy as np
import torch as th
from dgl.data.knowledge_graph import load_data
from . import BaseDataset, register_dataset
from . import AcademicDataset, HGBDataset, OHGBDataset
from ..utils import add_reverse_edges
__all__ = ['LinkPredictionDataset', 'HGB_LinkPrediction']
@register_dataset('link_prediction')
class LinkPredictionDataset(BaseDataset):
    """Base dataset for the link prediction task.

    Attributes
    ----------
    target_link : list of canonical etypes or None
        Edge types whose links are predicted. Initialised to ``None`` here and
        expected to be filled in by subclasses before ``get_split`` is called.
    target_link_r : list of canonical etypes or None
        Reverse edge types, aligned index-by-index with ``target_link``.
        ``None`` means the graph carries no reverse edges to keep in sync.
    """

    def __init__(self, *args, **kwargs):
        super(LinkPredictionDataset, self).__init__(*args, **kwargs)
        self.target_link = None
        self.target_link_r = None

    def get_split(self, val_ratio=0.1, test_ratio=0.2):
        """Build the train / validation / test graphs.

        For every edge type in ``self.target_link``:

        * If the edge data has no ``train_mask``, all edges are shuffled and
          split randomly; ``val_ratio`` and ``test_ratio`` give the fractions
          that are held out (the remainder stays in the training graph).
        * If ``train_mask`` exists, ``test_mask`` is used for the test edges.
          Validation edges come from ``valid_mask`` when present, otherwise a
          random ``val_ratio`` fraction of the training edges is held out.

        Held-out edges are removed from the returned training graph.

        Parameters
        ----------
        val_ratio : float
            Fraction of edges used for validation. Default: 0.1
        test_ratio : float
            Fraction of edges used for testing; only used when the graph has
            no ``train_mask``. Default: 0.2

        Returns
        -------
        tuple
            ``(train_graph, val_graph, test_graph, None, None)``. The last two
            slots are always ``None`` in this implementation.
        """
        val_edge_dict = {}
        test_edge_dict = {}
        out_ntypes = []
        train_graph = self.g
        for i, etype in enumerate(self.target_link):
            num_edges = self.g.num_edges(etype)
            edata = self.g.edges[etype].data
            if 'train_mask' not in edata:
                # No predefined split: shuffle every edge id and carve off the
                # validation and test portions from the front.
                random_int = th.randperm(num_edges)
                num_val = int(num_edges * val_ratio)
                num_held_out = int(num_edges * (test_ratio + val_ratio))
                val_index = random_int[:num_val]
                test_index = random_int[num_val:num_held_out]
                val_edge = self.g.find_edges(val_index, etype)
                test_edge = self.g.find_edges(test_index, etype)

                val_edge_dict[etype] = val_edge
                test_edge_dict[etype] = test_edge
                out_ntypes.append(etype[0])
                out_ntypes.append(etype[2])
                train_graph = dgl.remove_edges(train_graph, th.cat((val_index, test_index)), etype)
                if self.target_link_r is not None:
                    # Rebuild the reverse relation from the remaining training
                    # edges so that held-out links do not leak through it.
                    reverse_edge = self.target_link_r[i]
                    train_graph = dgl.remove_edges(train_graph, th.arange(train_graph.num_edges(reverse_edge)),
                                                   reverse_edge)
                    edges = train_graph.edges(etype=etype)
                    train_graph = dgl.add_edges(train_graph, edges[1], edges[0], etype=reverse_edge)
            else:
                if 'valid_mask' not in edata:
                    # Sample validation edges among the *training edge ids*.
                    # Indexing with the raw permutation would pick edge ids
                    # 0..k-1, which are not necessarily training edges.
                    train_index = th.nonzero(edata['train_mask'].reshape(-1)).reshape(-1)
                    random_int = th.randperm(len(train_index))
                    val_index = train_index[random_int[:int(len(train_index) * val_ratio)]]
                else:
                    # reshape(-1) keeps the index 1-D even when a single edge
                    # matches (squeeze() would collapse it to 0-D).
                    val_index = th.nonzero(edata['valid_mask'].reshape(-1)).reshape(-1)
                val_edge = self.g.find_edges(val_index, etype)

                test_index = th.nonzero(edata['test_mask'].reshape(-1)).reshape(-1)
                test_edge = self.g.find_edges(test_index, etype)

                val_edge_dict[etype] = val_edge
                test_edge_dict[etype] = test_edge
                out_ntypes.append(etype[0])
                out_ntypes.append(etype[2])
                # NOTE(review): this requires a 'label' edge feature on the
                # target relation and raises KeyError otherwise -- confirm that
                # every masked dataset provides it.
                self.test_label = train_graph.edges[etype[1]].data['label'][test_index]
                train_graph = dgl.remove_edges(train_graph, th.cat((val_index, test_index)), etype)

        self.out_ntypes = set(out_ntypes)
        num_nodes_dict = {ntype: self.g.number_of_nodes(ntype) for ntype in set(out_ntypes)}
        val_graph = dgl.heterograph(val_edge_dict, num_nodes_dict)
        test_graph = dgl.heterograph(test_edge_dict, num_nodes_dict)

        # todo: val/test negative graphs should be created before training rather than
        #  create them dynamically in every evaluation.
        return train_graph, val_graph, test_graph, None, None
@register_dataset('demo_link_prediction')
class Test_LinkPrediction(LinkPredictionDataset):
    """Small debugging dataset backed by graphs stored under ``./openhgnn/debug``."""

    def __init__(self, dataset_name):
        super(Test_LinkPrediction, self).__init__()
        # NOTE(review): load_HIN is not defined in the visible base class;
        # presumably it is inherited from BaseDataset -- confirm.
        self.g = self.load_HIN('./openhgnn/debug/data.bin')
        self.target_link = 'user-item'
        self.has_feature = False
        self.meta_paths_dict = None
        self.preprocess()
        # self.generate_negative()

    def preprocess(self):
        """Move the masked test edges into ``pos_test_graph`` and load the negatives.

        The test edges are removed in place from both the ``user-item`` and the
        ``item-user`` relations of ``self.g``.
        """
        link = self.target_link
        held_out = th.nonzero(self.g.edges[link].data['test_mask']).squeeze()
        self.test_edge = self.g.find_edges(held_out, link)

        node_counts = {ntype: self.g.number_of_nodes(ntype) for ntype in ['user', 'item']}
        self.pos_test_graph = dgl.heterograph({('user', 'user-item', 'item'): self.test_edge},
                                              node_counts)

        for relation in (link, 'item-user'):
            self.g.remove_edges(held_out, relation)

        loaded_graphs, _ = dgl.load_graphs('./openhgnn/debug/neg.bin')
        self.neg_test_graph = loaded_graphs[0]

    def generate_negative(self):
        """Draw 99 random negative items per positive test edge and save them to disk.

        A candidate is redrawn while it is a successor of the user in
        ``pos_test_graph``.
        """
        k = 99
        num_items = self.g.number_of_nodes('item')
        positive_users = self.pos_test_graph.edges()[0]

        sampled_src = []
        sampled_dst = []
        for user in positive_users:
            known_items = self.pos_test_graph.successors(user)
            candidates = th.randint(high=num_items, size=(k,))
            for slot in range(k):
                # Rejection sampling: redraw until the item is not a known positive.
                while candidates[slot] in known_items:
                    candidates[slot] = th.randint(high=num_items, size=(1,))
            sampled_src.append(user.repeat_interleave(k))
            sampled_dst.append(candidates)

        neg_edge = (th.cat(sampled_src), th.cat(sampled_dst))
        node_counts = {ntype: self.g.number_of_nodes(ntype) for ntype in ['user', 'item']}
        neg_graph = dgl.heterograph({('user', 'user-item', 'item'): neg_edge}, node_counts)
        dgl.save_graphs('./openhgnn/debug/neg.bin', neg_graph)
@register_dataset('hin_link_prediction')
class HIN_LinkPrediction(LinkPredictionDataset):
    """Link prediction datasets on heterogeneous information networks.

    The graph is selected by ``dataset_name``; see ``load_HIN`` for the
    supported names and the attributes each of them sets.
    """

    def __init__(self, dataset_name, *args, **kwargs):
        super(HIN_LinkPrediction, self).__init__(*args, **kwargs)
        self.g = self.load_HIN(dataset_name)

    def load_link_pred(self, path):
        """Read a ``u, v, label`` text file.

        Parameters
        ----------
        path : str
            File whose non-empty lines hold three integers separated by ``', '``.

        Returns
        -------
        tuple of list
            ``(u_list, v_list, label_list)`` with one entry per non-empty line.
        """
        u_list = []
        v_list = []
        label_list = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing one at end of file).
                    continue
                u, v, label = line.split(', ')
                u_list.append(int(u))
                v_list.append(int(v))
                label_list.append(int(label))
        return u_list, v_list, label_list

    def load_HIN(self, dataset_name):
        """Load the graph for ``dataset_name`` and set the task attributes.

        Parameters
        ----------
        dataset_name : str
            One of ``academic4HetGNN``, ``Book-Crossing``, ``amazon4SLICE``,
            ``MTWM``, ``HGBl-ACM``, ``HGBl-DBLP`` or ``HGBl-IMDB``.

        Returns
        -------
        The loaded graph.

        Raises
        ------
        ValueError
            If ``dataset_name`` is not one of the supported names.
        """
        self.dataset_name = dataset_name
        if dataset_name == 'academic4HetGNN':
            # which is used in HetGNN
            dataset = AcademicDataset(name='academic4HetGNN', raw_dir='')
            g = dataset[0].long()

            self.train_batch = self.load_link_pred('./openhgnn/dataset/' + dataset_name + '/a_a_list_train.txt')
            self.test_batch = self.load_link_pred('./openhgnn/dataset/' + dataset_name + '/a_a_list_test.txt')
            self.category = 'author'
        elif dataset_name == 'Book-Crossing':
            g, _ = dgl.load_graphs('./openhgnn/dataset/book_graph.bin')
            g = g[0]
            self.target_link = [('user', 'user-item', 'item')]
            self.node_type = ['user', 'item']
        elif dataset_name == 'amazon4SLICE':
            dataset = AcademicDataset(name='amazon4SLICE', raw_dir='')
            g = dataset[0].long()
        elif dataset_name == 'MTWM':
            dataset = AcademicDataset(name='MTWM', raw_dir='')
            g = dataset[0].long()
            g = add_reverse_edges(g)
            self.target_link = [('user', 'user-buy-spu', 'spu')]
            self.target_link_r = [('spu', 'user-buy-spu-rev', 'user')]
            self.meta_paths_dict = {
                'UPU1': [('user', 'user-buy-poi', 'poi'), ('poi', 'user-buy-poi-rev', 'user')],
                'UPU2': [('user', 'user-click-poi', 'poi'), ('poi', 'user-click-poi-rev', 'user')],
                'USU': [('user', 'user-buy-spu', 'spu'), ('spu', 'user-buy-spu-rev', 'user')],
                'UPSPU1': [('user', 'user-buy-poi', 'poi'), ('poi', 'poi-contain-spu', 'spu'),
                           ('spu', 'poi-contain-spu-rev', 'poi'), ('poi', 'user-buy-poi-rev', 'user')
                           ],
                'UPSPU2': [
                    ('user', 'user-click-poi', 'poi'), ('poi', 'poi-contain-spu', 'spu'),
                    ('spu', 'poi-contain-spu-rev', 'poi'), ('poi', 'user-click-poi-rev', 'user')
                ]
            }
            self.node_type = ['user', 'spu']
        elif dataset_name == 'HGBl-ACM':
            dataset = HGBDataset(name='HGBn-ACM', raw_dir='')
            g = dataset[0].long()
            self.has_feature = True
            self.target_link = [('paper', 'paper-ref-paper', 'paper')]
            self.node_type = ['author', 'paper', 'subject', 'term']
            self.target_link_r = [('paper', 'paper-cite-paper', 'paper')]
            self.meta_paths_dict = {'PAP': [('paper', 'paper-author', 'author'), ('author', 'author-paper', 'paper')],
                                    'PSP': [('paper', 'paper-subject', 'subject'),
                                            ('subject', 'subject-paper', 'paper')],
                                    'PcPAP': [('paper', 'paper-cite-paper', 'paper'),
                                              ('paper', 'paper-author', 'author'),
                                              ('author', 'author-paper', 'paper')],
                                    'PcPSP': [('paper', 'paper-cite-paper', 'paper'),
                                              ('paper', 'paper-subject', 'subject'),
                                              ('subject', 'subject-paper', 'paper')],
                                    'PrPAP': [('paper', 'paper-ref-paper', 'paper'),
                                              ('paper', 'paper-author', 'author'),
                                              ('author', 'author-paper', 'paper')],
                                    'PrPSP': [('paper', 'paper-ref-paper', 'paper'),
                                              ('paper', 'paper-subject', 'subject'),
                                              ('subject', 'subject-paper', 'paper')]
                                    }
        elif dataset_name == 'HGBl-DBLP':
            dataset = HGBDataset(name='HGBn-DBLP', raw_dir='')
            g = dataset[0].long()
            self.has_feature = True
            self.target_link = [('author', 'author-paper', 'paper')]
            self.node_type = ['author', 'paper', 'venue', 'term']
            self.target_link_r = [('paper', 'paper-author', 'author')]
            self.meta_paths_dict = {'APA': [('author', 'author-paper', 'paper'), ('paper', 'paper-author', 'author')],
                                    'APTPA': [('author', 'author-paper', 'paper'), ('paper', 'paper-term', 'term'),
                                              ('term', 'term-paper', 'paper'), ('paper', 'paper-author', 'author')],
                                    'APVPA': [('author', 'author-paper', 'paper'), ('paper', 'paper-venue', 'venue'),
                                              ('venue', 'venue-paper', 'paper'), ('paper', 'paper-author', 'author')],
                                    'PAP': [('paper', 'paper-author', 'author'), ('author', 'author-paper', 'paper')],
                                    'PTP': [('paper', 'paper-term', 'term'), ('term', 'term-paper', 'paper')],
                                    'PVP': [('paper', 'paper-venue', 'venue'), ('venue', 'venue-paper', 'paper')],
                                    }
        elif dataset_name == 'HGBl-IMDB':
            dataset = HGBDataset(name='HGBn-IMDB', raw_dir='')
            g = dataset[0].long()
            self.has_feature = True
            self.target_link = [('actor', 'actor->movie', 'movie')]
            self.node_type = ['actor', 'director', 'keyword', 'movie']
            self.target_link_r = [('movie', 'movie->actor', 'actor')]
            self.meta_paths_dict = {
                'MAM': [('movie', 'movie->actor', 'actor'), ('actor', 'actor->movie', 'movie')],
                'MDM': [('movie', 'movie->director', 'director'), ('director', 'director->movie', 'movie')],
                'MKM': [('movie', 'movie->keyword', 'keyword'), ('keyword', 'keyword->movie', 'movie')],
                'AMA': [('actor', 'actor->movie', 'movie'), ('movie', 'movie->actor', 'actor')],
                'AMDMA': [('actor', 'actor->movie', 'movie'), ('movie', 'movie->director', 'director'),
                          ('director', 'director->movie', 'movie'), ('movie', 'movie->actor', 'actor')]
            }
        else:
            # Previously an unknown name fell through to ``return g`` and
            # failed with an opaque UnboundLocalError.
            raise ValueError('Unsupported dataset name for HIN_LinkPrediction: {!r}'.format(dataset_name))
        return g

    def get_split(self, val_ratio=0.1, test_ratio=0.2):
        """Return the dataset split.

        ``academic4HetGNN`` has no graph split here and yields five ``None``
        values (its link lists are loaded into ``train_batch``/``test_batch``
        by ``load_HIN``); every other dataset uses the base-class split.
        """
        if self.dataset_name == 'academic4HetGNN':
            return None, None, None, None, None
        else:
            return super(HIN_LinkPrediction, self).get_split(val_ratio, test_ratio)
@register_dataset('HGBl_link_prediction')
class HGB_LinkPrediction(LinkPredictionDataset):
    r"""
    The HGB dataset will be used in task *link prediction*.

    Dataset Name :
    HGBl-amazon/HGBl-LastFM/HGBl-PubMed

    So if you want to get more information, refer to
    `HGB datasets <https://github.com/THUDM/HGB>`_

    Attributes
    -----------
    has_feature : bool
        Whether the dataset has feature. Set to ``True`` only for HGBl-PubMed.
    target_link : list of tuple[canonical_etypes]
        The etypes of test link. HGBl-amazon has two etypes of test link. other has only one.
    target_link_r : list of tuple[canonical_etypes] or None
        Reverse etypes aligned with ``target_link``; ``None`` for HGBl-amazon.
    test_edge_type : dict
        Maps the edge-type name of each test link to the integer id written by
        ``save_results``.
    shift_dict : dict
        Offset added to the node ids of each node type in ``save_results``.
    """

    def __init__(self, dataset_name, *args, **kwargs):
        super(HGB_LinkPrediction, self).__init__(*args, **kwargs)
        self.dataset_name = dataset_name
        self.target_link_r = None
        if dataset_name == 'HGBl-amazon':
            dataset = HGBDataset(name=dataset_name, raw_dir='')
            g = dataset[0].long()
            self.has_feature = False
            self.target_link = [('product', 'product-product-0', 'product'),
                                ('product', 'product-product-1', 'product')]
            self.target_link_r = None
            self.link = [0, 1]
            self.node_type = ["product"]
            self.test_edge_type = {'product-product-0': 0, 'product-product-1': 1}
            self.meta_paths_dict = {
                'P0P': [('product', 'product-product-0', 'product'), ('product', 'product-product-1', 'product')],
                'P1P': [('product', 'product-product-1', 'product'), ('product', 'product-product-0', 'product')]
            }
        elif dataset_name == 'HGBl-LastFM':
            dataset = HGBDataset(name=dataset_name, raw_dir='')
            g = dataset[0].long()
            self.has_feature = False
            self.target_link = [('user', 'user-artist', 'artist')]
            self.node_type = ['user', 'artist', 'tag']
            self.test_edge_type = {'user-artist': 0}
            g = add_reverse_edges(g)
            self.target_link_r = [('artist', 'user-artist-rev', 'user')]
            self.meta_paths_dict = {'UU': [('user', 'user-user', 'user')],
                                    'UAU': [('user', 'user-artist', 'artist'), ('artist', 'user-artist-rev', 'user')],
                                    'UATAU': [('user', 'user-artist', 'artist'), ('artist', 'artist-tag', 'tag'),
                                              ('tag', 'artist-tag-rev', 'artist'),
                                              ('artist', 'user-artist-rev', 'user')],
                                    'AUA': [('artist', 'user-artist-rev', 'user'), ('user', 'user-artist', 'artist')],
                                    'ATA': [('artist', 'artist-tag', 'tag'), ('tag', 'artist-tag-rev', 'artist')]
                                    }
        elif dataset_name == 'HGBl-PubMed':
            dataset = HGBDataset(name=dataset_name, raw_dir='')
            g = dataset[0].long()
            self.has_feature = True
            self.target_link = [('1', '1_to_1', '1')]
            self.node_type = ['0', '1', '2', '3']
            self.test_edge_type = {'1_to_1': 2}
            g = add_reverse_edges(g)
            self.target_link_r = [('1', '1_to_1-rev', '1')]
            self.meta_paths_dict = {'101': [('1', '0_to_1-rev', '0'), ('0', '0_to_1', '1')],
                                    '111': [('1', '1_to_1', '1'), ('1', '1_to_1-rev', '1')],
                                    '121': [('1', '2_to_1-rev', '2'), ('2', '2_to_1', '1')],
                                    '131': [('1', '3_to_1-rev', '3'), ('3', '3_to_1', '1')]
                                    }
        else:
            # Previously an unknown name failed later with UnboundLocalError on ``g``.
            raise ValueError('Unsupported dataset name for HGB_LinkPrediction: {!r}'.format(dataset_name))

        self.g = g
        self.shift_dict = self.calculate_node_shift()

    def load_link_pred(self, path):
        """Placeholder; returns ``None``."""
        return

    def calculate_node_shift(self):
        """Return the cumulative node-count offset of every type in ``self.node_type``.

        The first node type gets offset 0, the next one the number of nodes of
        the first type, and so on, following the order of ``self.node_type``.
        """
        node_shift_dict = {}
        count = 0
        for ntype in self.node_type:
            node_shift_dict[ntype] = count
            count += self.g.num_nodes(ntype)
        return node_shift_dict

    def get_split(self, val_ratio=0.1, test_ratio=0.2):
        r"""
        Get graphs for train, valid or test.

        The dataset has not validation_mask, so we split train edges randomly.

        Parameters
        ----------
        val_ratio : float
            Fraction of the training edges held out for validation. Default: 0.1
        test_ratio : float
            Unused: test edges always come from ``test_mask``. Accepted only so
            that the signature matches ``LinkPredictionDataset.get_split``.

        Returns
        -------
        tuple
            ``(train_graph, val_graph, test_graph, None, None)``.
        """
        val_edge_dict = {}
        test_edge_dict = {}
        out_ntypes = []
        train_graph = self.g
        for i, etype in enumerate(self.target_link):
            edata = self.g.edges[etype].data
            # reshape(-1) keeps the index tensors 1-D even when only a single
            # edge is selected (squeeze() would collapse them to 0-D).
            train_index = th.nonzero(edata['train_mask'].reshape(-1)).reshape(-1)
            random_int = th.randperm(len(train_index))[:int(len(train_index) * val_ratio)]
            val_index = train_index[random_int]
            val_edge = self.g.find_edges(val_index, etype)

            test_index = th.nonzero(edata['test_mask'].reshape(-1)).reshape(-1)
            test_edge = self.g.find_edges(test_index, etype)

            val_edge_dict[etype] = val_edge
            test_edge_dict[etype] = test_edge
            out_ntypes.append(etype[0])
            out_ntypes.append(etype[2])

            held_out = th.cat((val_index, test_index))
            train_graph = dgl.remove_edges(train_graph, held_out, etype)
            if self.target_link_r is not None:
                # The same edge ids are removed from the reverse relation.
                # NOTE(review): this relies on add_reverse_edges keeping the
                # edge order of the forward relation -- confirm.
                train_graph = dgl.remove_edges(train_graph, held_out, self.target_link_r[i])

        self.out_ntypes = set(out_ntypes)
        num_nodes_dict = {ntype: self.g.number_of_nodes(ntype) for ntype in set(out_ntypes)}
        val_graph = dgl.heterograph(val_edge_dict, num_nodes_dict)
        test_graph = dgl.heterograph(test_edge_dict, num_nodes_dict)

        return train_graph, val_graph, test_graph, None, None

    def save_results(self, hg, score, file_path):
        """Write one tab-separated line per scored edge to ``file_path``.

        Each line is ``src<TAB>dst<TAB>edge_type<TAB>score`` where node ids are
        shifted by ``self.shift_dict``, the edge type id comes from
        ``self.test_edge_type`` and the score is rounded to 4 decimals. Edges
        are emitted in the order of ``hg.canonical_etypes`` and paired
        positionally with ``score``.
        """
        with hg.local_scope():
            src_list = []
            dst_list = []
            edge_type_list = []
            for etype in hg.canonical_etypes:
                edges = hg.edges(etype=etype)
                src_id = edges[0] + self.shift_dict[etype[0]]
                dst_id = edges[1] + self.shift_dict[etype[2]]
                src_list.append(src_id)
                dst_list.append(dst_id)
                # Explicit integer dtype so the id is never written as a float.
                edge_type_list.append(th.full((src_id.shape[0],), self.test_edge_type[etype[1]],
                                              dtype=th.long))

            src_ids = th.cat(src_list).tolist()
            dst_ids = th.cat(dst_list).tolist()
            edge_type_ids = th.cat(edge_type_list).tolist()
            with open(file_path, "w") as f:
                for left, right, edge_type, c in zip(src_ids, dst_ids, edge_type_ids, score):
                    f.write(f"{left}\t{right}\t{edge_type}\t{round(float(c), 4)}\n")
@register_dataset('ohgb_link_prediction')
class OHGB_LinkPrediction(LinkPredictionDataset):
    """Link prediction on the OHGB datasets.

    Supported names: ``ohgbl-MTWM``, ``ohgbl-yelp1``, ``ohgbl-yelp2`` and
    ``ohgbl-Freebase``. ``has_feature`` is set to ``True`` for all of them.
    """

    # dataset name -> (target link, reverse link). The reverse link swaps the
    # endpoints of the target link.
    _TARGET_LINKS = {
        'ohgbl-MTWM': (('user', 'user-buy-spu', 'spu'),
                       ('spu', 'user-buy-spu-rev', 'user')),
        'ohgbl-yelp1': (('user', 'user-buy-business', 'business'),
                        ('business', 'user-buy-business-rev', 'user')),
        # The reverse relation previously repeated the forward endpoints
        # ('business' -> 'phrase'); a reverse edge must run 'phrase' -> 'business'.
        # NOTE(review): confirm against the canonical etypes of the stored graph.
        'ohgbl-yelp2': (('business', 'described-with', 'phrase'),
                        ('phrase', 'described-with-rev', 'business')),
        'ohgbl-Freebase': (('BOOK', 'BOOK-and-BOOK', 'BOOK'),
                           ('BOOK', 'BOOK-and-BOOK-rev', 'BOOK')),
    }

    def __init__(self, dataset_name, *args, **kwargs):
        super(OHGB_LinkPrediction, self).__init__(*args, **kwargs)

        self.dataset_name = dataset_name
        self.has_feature = True
        if dataset_name not in self._TARGET_LINKS:
            # Previously an unknown name failed with UnboundLocalError on ``g``.
            raise ValueError('Unsupported dataset name for OHGB_LinkPrediction: {!r}'.format(dataset_name))

        dataset = OHGBDataset(name=dataset_name, raw_dir='')
        g = dataset[0].long()
        forward_link, reverse_link = self._TARGET_LINKS[dataset_name]
        self.target_link = [forward_link]
        self.target_link_r = [reverse_link]
        if dataset_name == 'ohgbl-MTWM':
            self.node_type = ['user', 'spu']
        self.g = g
def build_graph_from_triplets(num_nodes, num_rels, triplets):
    """ Create a DGL graph. The graph is bidirectional because RGCN authors
        use reversed relations.
        This function also generates edge type and normalization factor
        (reciprocal of node incoming degree)

    Parameters
    ----------
    num_nodes : int
        Number of nodes of the created graph.
    num_rels : int
        Number of relations; a reversed edge gets relation id ``rel + num_rels``.
    triplets : tuple of numpy.ndarray
        ``(src, rel, dst)`` arrays of equal length.

    Returns
    -------
    g : DGLGraph
        Graph holding every edge and its reverse, sorted by ``(dst, src, rel)``.
    rel : numpy.ndarray
        ``int64`` relation id of every edge, in edge order.
    norm : numpy.ndarray
        Floating-point ``1 / in_degree`` per node (0 for nodes without
        incoming edges), as returned by ``comp_deg_norm``.
    """
    g = dgl.graph(([], []))
    g.add_nodes(num_nodes)
    src, rel, dst = triplets
    # Add the reversed copy of every edge with a shifted relation id.
    src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
    rel = np.concatenate((rel, rel + num_rels))
    # lexsort uses the LAST key as primary key, so this orders the edges by
    # (dst, src, rel) -- the same order as sorting the zipped tuples, but it
    # also works for an empty edge list.
    order = np.lexsort((rel, src, dst))
    src = src[order].astype('int64')
    dst = dst[order].astype('int64')
    rel = rel[order]
    g.add_edges(src, dst)
    norm = comp_deg_norm(g)
    print("# nodes: {}, # edges: {}".format(num_nodes, len(src)))
    # The norm is a reciprocal in (0, 1]; casting it to int64 (as was done
    # before) truncated every value below 1 to 0.
    return g, rel.astype('int64'), norm
def comp_deg_norm(g):
    """Return the reciprocal in-degree of every node of ``g``.

    Parameters
    ----------
    g : DGLGraph
        Graph whose in-degrees are read; it is not modified.

    Returns
    -------
    numpy.ndarray
        Float array with ``1 / in_degree`` per node, and 0 for nodes that have
        no incoming edge.
    """
    in_deg = g.in_degrees().float().cpu().numpy()
    norm = np.zeros_like(in_deg)
    # Divide only where the degree is non-zero: isolated nodes keep the 0 they
    # were initialised with, and no divide-by-zero warning is raised.
    np.divide(1.0, in_deg, out=norm, where=in_deg != 0)
    return norm
@register_dataset('kg_link_prediction')
class KG_LinkPrediction(LinkPredictionDataset):
    """
    From `RGCN <https://arxiv.org/abs/1703.06103>`_, WN18 & FB15k face a data leakage.

    For ``wn18``, ``FB15k`` and ``FB15k-237`` the knowledge graph is loaded via
    ``dgl.data.knowledge_graph.load_data`` and converted into three
    heterographs (train / valid / test) with a single node type ``'_N'`` and
    one edge type per relation id. Other names leave the instance without
    these attributes.
    """

    def __init__(self, dataset_name, *args, **kwargs):
        super(KG_LinkPrediction, self).__init__(*args, **kwargs)
        if dataset_name in ['wn18', 'FB15k', 'FB15k-237']:
            dataset = load_data(dataset_name)
            g = dataset[0]
            self.num_rels = dataset.num_rels
            self.num_nodes = dataset.num_nodes
            self.train_hg, self.train_triplets = self._build_hg(g, 'train')
            self.valid_hg, self.valid_triplets = self._build_hg(g, 'valid')
            self.test_hg, self.test_triplets = self._build_hg(g, 'test')
            self.g = self.train_hg
            self.category = '_N'
            self.target_link = self.test_hg.canonical_etypes

    def _relation_heterograph(self, src, dst, etype, ntype):
        """Group the edges ``(src, dst)`` by relation id into a heterograph.

        Every relation id in ``range(self.num_rels)`` becomes the edge type
        ``(ntype, str(id), ntype)``, including relations without any edge.
        """
        edge_dict = {}
        for i in range(self.num_rels):
            mask = (etype == i)
            edge_dict[(ntype, str(i), ntype)] = (src[mask], dst[mask])
        return dgl.heterograph(edge_dict, {ntype: self.num_nodes})

    def _build_hg(self, g, mode):
        """Extract the ``<mode>_edge_mask`` edges of ``g``.

        Returns
        -------
        hg : heterograph with one edge type per relation
        triplets : tensor whose rows are ``(src, etype, dst)``
        """
        sub_g = dgl.edge_subgraph(g, g.edata[mode + '_edge_mask'], relabel_nodes=False)
        src, dst = sub_g.edges()
        etype = sub_g.edata['etype']

        # The node type is passed literally because ``self.category`` is only
        # assigned after the graphs have been built in ``__init__``.
        hg = self._relation_heterograph(src, dst, etype, '_N')
        return hg, th.stack((src, etype, dst)).T

    def modify_size(self, eval_percent, dataset_type):
        """Randomly keep ``ceil(eval_percent * N)`` validation or test triplets.

        ``dataset_type`` selects ``'valid'`` or ``'test'``; any other value
        leaves both sets untouched.
        """
        if dataset_type == 'valid':
            self.valid_triplets = th.tensor(random.sample(self.valid_triplets.tolist(), math.ceil(self.valid_triplets.shape[0]*eval_percent)))
        elif dataset_type == 'test':
            self.test_triplets = th.tensor(random.sample(self.test_triplets.tolist(), math.ceil(self.test_triplets.shape[0]*eval_percent)))

    def get_graph_directed_from_triples(self, triples, format='graph'):
        """Build a heterograph from ``(subject, relation, object)`` rows.

        Returns the heterograph when ``format == 'graph'`` and ``None`` for
        any other ``format``.
        """
        s = th.LongTensor(triples[:, 0])
        r = th.LongTensor(triples[:, 1])
        o = th.LongTensor(triples[:, 2])
        if format == 'graph':
            return self._relation_heterograph(s, o, r, self.category)

    def get_triples(self, g, mask_mode):
        '''
        :param g: homogeneous graph with an ``etype`` edge feature
        :param mask_mode: should be one of 'train_mask', 'val_mask', 'test_mask
        :return: tensor of shape (3, num_selected) stacking src, etype, dst

        Note that the mask is *popped*, i.e. removed from ``g.edata``.
        '''
        edges = g.edges()
        etype = g.edata['etype']
        mask = g.edata.pop(mask_mode)
        return th.stack((edges[0][mask], etype[mask], edges[1][mask]))

    def get_all_triplets(self, dataset):
        """Return the train / valid / test triplets of ``dataset`` as LongTensors."""
        train_data = th.LongTensor(dataset.train)
        valid_data = th.LongTensor(dataset.valid)
        test_data = th.LongTensor(dataset.test)
        return train_data, valid_data, test_data

    def get_split(self):
        """Return the prebuilt ``(train_hg, valid_hg, test_hg, None, None)``."""
        return self.train_hg, self.valid_hg, self.test_hg, None, None

    def split_graph(self, g, mode='train'):
        """

        Parameters
        ----------
        g: DGLGraph
            a homogeneous graph fomat
        mode: str
            split the subgraph according to the mode; one of ``'train'``,
            ``'valid'`` or ``'test'``

        Returns
        -------
        hg: DGLHeterograph

        Raises
        ------
        ValueError
            If ``mode`` is not one of the supported values.
        """
        # NOTE(review): 'train' reads 'train_mask' while the other modes read
        # '*_edge_mask'; kept as in the original -- confirm this is intended.
        mask_names = {
            'train': 'train_mask',
            'valid': 'valid_edge_mask',
            'test': 'test_edge_mask',
        }
        if mode not in mask_names:
            # Previously this fell through and failed with UnboundLocalError.
            raise ValueError("mode must be one of 'train', 'valid' or 'test', got {!r}".format(mode))
        edges = g.edges()
        etype = g.edata['etype']
        mask = g.edata[mask_names[mode]]
        hg = self.build_graph((edges[0][mask], edges[1][mask]), etype[mask])
        return hg

    def build_graph(self, edges, etype):
        """Build a heterograph from ``edges = (src, dst)`` and their relation ids."""
        return self._relation_heterograph(edges[0], edges[1], etype, self.category)

    def build_g(self, train):
        """Build a heterograph from rows of ``(subject, relation, object)``."""
        s = th.LongTensor(train[:, 0])
        r = th.LongTensor(train[:, 1])
        o = th.LongTensor(train[:, 2])
        return self._relation_heterograph(s, o, r, self.category)
class kg_sampler():
    """Sampler producing a training sub-graph plus negative samples.

    Attributes
    ----------
    sampler : str
        Default edge sampling strategy, ``'uniform'`` or ``'neighbor'``.
    """

    def __init__(self, ):
        self.sampler = 'uniform'

    def generate_sampled_graph_and_labels(self, triplets, sample_size, split_size,
                                          num_rels, adj_list, degrees,
                                          negative_rate, sampler=None):
        """Get training graph and signals
        First perform edge neighborhood sampling on graph, then perform negative
        sampling to generate negative samples

        Parameters
        ----------
        triplets : numpy.ndarray
            Rows of ``(src, rel, dst)``.
        sample_size : int
            Number of edges to sample.
        split_size : float
            Fraction of the sampled edges used as graph structure.
        num_rels : int
            Number of relations.
        adj_list, degrees
            Adjacency information forwarded to the edge sampler.
        negative_rate : int
            Forwarded to ``negative_sampling``.
        sampler : str, optional
            ``'uniform'`` or ``'neighbor'``. When omitted, ``self.sampler`` is
            used. (This argument used to be ignored.)

        Returns
        -------
        tuple
            ``(g, uniq_v, rel, norm, samples, labels)``.

        Raises
        ------
        ValueError
            If the sampling strategy is neither ``'uniform'`` nor ``'neighbor'``.
        """
        strategy = self.sampler if sampler is None else sampler
        # perform edge neighbor sampling
        if strategy == "uniform":
            edges = sample_edge_uniform(adj_list, degrees, len(triplets), sample_size)
        elif strategy == "neighbor":
            edges = sample_edge_neighborhood(adj_list, degrees, len(triplets), sample_size)
        else:
            raise ValueError("Sampler type must be either 'uniform' or 'neighbor'.")

        # relabel nodes to have consecutive node ids
        edges = triplets[edges]
        src, rel, dst = edges.transpose()
        uniq_v, edges = np.unique((src, dst), return_inverse=True)
        src, dst = np.reshape(edges, (2, -1))
        relabeled_edges = np.stack((src, rel, dst)).transpose()

        # negative sampling
        samples, labels = negative_sampling(relabeled_edges, len(uniq_v),
                                            negative_rate)

        # further split graph, only half of the edges will be used as graph
        # structure, while the rest half is used as unseen positive samples
        split_size = int(sample_size * split_size)
        graph_split_ids = np.random.choice(np.arange(sample_size),
                                           size=split_size, replace=False)
        src = src[graph_split_ids]
        dst = dst[graph_split_ids]
        rel = rel[graph_split_ids]

        # build DGL graph
        print("# sampled nodes: {}".format(len(uniq_v)))
        print("# sampled edges: {}".format(len(src) * 2))
        g, rel, norm = build_graph_from_triplets(len(uniq_v), num_rels,
                                                 (src, rel, dst))
        return g, uniq_v, rel, norm, samples, labels
def sample_edge_neighborhood(adj_list, degrees, n_triplets, sample_size):
    """Sample edges by neighborhood expansion.
    This guarantees that the sampled edges form a connected graph, which
    may help deeper GNNs that require information from more than one hop.

    Parameters
    ----------
    adj_list : sequence of numpy.ndarray
        ``adj_list[v]`` holds one ``[edge_id, other_vertex]`` row per edge
        incident to vertex ``v``.
    degrees : numpy.ndarray
        Number of incident edges of every vertex.
    n_triplets : int
        Total number of edges.
    sample_size : int
        Number of distinct edges to sample.

    Returns
    -------
    numpy.ndarray
        ``int32`` array with ``sample_size`` distinct edge ids.

    Raises
    ------
    ValueError
        If more edges are requested than exist.
    """
    if sample_size > n_triplets:
        # Edges are drawn without replacement; oversampling used to fail late
        # with an opaque "probabilities contain NaN" error.
        raise ValueError("Cannot sample {} distinct edges from {} triplets.".format(
            sample_size, n_triplets))

    edges = np.zeros((sample_size), dtype=np.int32)

    # initialize
    sample_counts = np.array(degrees)  # unpicked incident edges per vertex
    picked = np.zeros(n_triplets, dtype=bool)
    seen = np.zeros(len(degrees), dtype=bool)
    vertex_ids = np.arange(degrees.shape[0])

    for i in range(0, sample_size):
        # Prefer vertices already touched, weighted by their remaining edges.
        weights = sample_counts * seen

        if np.sum(weights) == 0:
            # Nothing reachable from the seen set: restart uniformly from any
            # vertex that still has unpicked edges.
            weights = np.ones_like(weights)
            weights[sample_counts == 0] = 0

        probabilities = (weights) / np.sum(weights)
        chosen_vertex = np.random.choice(vertex_ids, p=probabilities)
        chosen_adj_list = adj_list[chosen_vertex]
        seen[chosen_vertex] = True

        # Rejection-sample an incident edge that has not been picked yet.
        while True:
            chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0]))
            chosen_edge = chosen_adj_list[chosen_edge]
            edge_number = chosen_edge[0]
            if not picked[edge_number]:
                break

        edges[i] = edge_number
        other_vertex = chosen_edge[1]
        picked[edge_number] = True
        sample_counts[chosen_vertex] -= 1
        sample_counts[other_vertex] -= 1
        seen[other_vertex] = True

    return edges
def sample_edge_uniform(adj_list, degrees, n_triplets, sample_size):
    """Draw ``sample_size`` distinct edge ids uniformly from ``range(n_triplets)``.

    ``adj_list`` and ``degrees`` are not used; they are accepted so that the
    signature matches ``sample_edge_neighborhood``.
    """
    chosen_ids = np.random.choice(n_triplets, size=sample_size, replace=False)
    return chosen_ids