Source code for openhgnn.dataset.LinkPredictionDataset

import dgl
import math
import random
import numpy as np
import torch as th
from dgl.data.knowledge_graph import load_data
from . import BaseDataset, register_dataset
from . import AcademicDataset, HGBDataset, OHGBDataset
from ..utils import add_reverse_edges

__all__ = ['LinkPredictionDataset', 'HGB_LinkPrediction']

@register_dataset('link_prediction')
class LinkPredictionDataset(BaseDataset):
    """
    metric: Accuracy, multi-label f1 or multi-class f1. Default: `accuracy`
    """

    def __init__(self, *args, **kwargs):
        super(LinkPredictionDataset, self).__init__(*args, **kwargs)
        self.target_link = None
        self.target_link_r = None

    def get_split(self, val_ratio=0.1, test_ratio=0.2):
        """
        Get subgraphs for train, valid and test.
        Generally, the original graph will have train_mask and test_mask in edata; otherwise we split it automatically.

        If the original graph does not have train_mask in edata, we assume there is no valid_mask or test_mask either,
        and split the edges of the original graph into train/valid/test with ratios 0.7/0.1/0.2.
        When there is a train_mask but no valid_mask, validation edges are sampled randomly from the train edges.

        Parameters
        ----------
        val_ratio : float
            The ratio of validation. Default: 0.1
        test_ratio : float
            The ratio of test. Default: 0.2

        Returns
        -------
        train_graph, val_graph, test_graph : dgl.DGLHeteroGraph
            The split graphs, followed by two placeholders for the negative graphs (currently None).
        """
        val_edge_dict = {}
        test_edge_dict = {}
        out_ntypes = []
        train_graph = self.g
        for i, etype in enumerate(self.target_link):
            num_edges = self.g.num_edges(etype)
            if 'train_mask' not in self.g.edges[etype].data:
                # Split edges into train/valid/test.
                random_int = th.randperm(num_edges)
                val_index = random_int[:int(num_edges * val_ratio)]
                val_edge = self.g.find_edges(val_index, etype)
                test_index = random_int[int(num_edges * val_ratio):int(num_edges * (test_ratio + val_ratio))]
                test_edge = self.g.find_edges(test_index, etype)

                val_edge_dict[etype] = val_edge
                test_edge_dict[etype] = test_edge
                out_ntypes.append(etype[0])
                out_ntypes.append(etype[2])
                train_graph = dgl.remove_edges(train_graph, th.cat((val_index, test_index)), etype)
                if self.target_link_r is not None:
                    # Rebuild the reverse etype so it mirrors the remaining train edges.
                    reverse_edge = self.target_link_r[i]
                    train_graph = dgl.remove_edges(train_graph,
                                                   th.arange(train_graph.num_edges(reverse_edge)),
                                                   reverse_edge)
                    edges = train_graph.edges(etype=etype)
                    train_graph = dgl.add_edges(train_graph, edges[1], edges[0], etype=reverse_edge)
            else:
                if 'valid_mask' not in self.g.edges[etype].data:
                    train_idx = self.g.edges[etype].data['train_mask']
                    random_int = th.randperm(int(train_idx.sum()))
                    val_index = random_int[:int(train_idx.sum() * val_ratio)]
                    val_edge = self.g.find_edges(val_index, etype)
                else:
                    val_mask = self.g.edges[etype].data['valid_mask'].squeeze()
                    val_index = th.nonzero(val_mask).squeeze()
                    val_edge = self.g.find_edges(val_index, etype)
                test_mask = self.g.edges[etype].data['test_mask'].squeeze()
                test_index = th.nonzero(test_mask).squeeze()
                test_edge = self.g.find_edges(test_index, etype)

                val_edge_dict[etype] = val_edge
                test_edge_dict[etype] = test_edge
                out_ntypes.append(etype[0])
                out_ntypes.append(etype[2])
                self.test_label = train_graph.edges[etype[1]].data['label'][test_index]
                train_graph = dgl.remove_edges(train_graph, th.cat((val_index, test_index)), etype)

        self.out_ntypes = set(out_ntypes)
        val_graph = dgl.heterograph(val_edge_dict,
                                    {ntype: self.g.number_of_nodes(ntype) for ntype in set(out_ntypes)})
        test_graph = dgl.heterograph(test_edge_dict,
                                     {ntype: self.g.number_of_nodes(ntype) for ntype in set(out_ntypes)})

        # TODO: val/test negative graphs should be created before training rather than
        # created dynamically in every evaluation.
        return train_graph, val_graph, test_graph, None, None
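
# Usage sketch (not part of the original module): how a trainer typically
# consumes get_split(). `_demo_get_split` is a hypothetical helper; any
# registered link-prediction dataset with a populated ``self.g`` and
# ``self.target_link`` can be passed in.
def _demo_get_split(dataset):
    train_hg, val_hg, test_hg, _, _ = dataset.get_split(val_ratio=0.1, test_ratio=0.2)
    # val_hg/test_hg hold only the held-out positive target edges; negative
    # graphs are still built dynamically at evaluation time (see the TODO above).
    print(train_hg.num_edges(), val_hg.num_edges(), test_hg.num_edges())
    return train_hg, val_hg, test_hg
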
@register_dataset('demo_link_prediction')
class Test_LinkPrediction(LinkPredictionDataset):

    def __init__(self, dataset_name):
        super(Test_LinkPrediction, self).__init__()
        self.g = self.load_HIN('./openhgnn/debug/data.bin')
        self.target_link = 'user-item'
        self.has_feature = False
        self.meta_paths_dict = None
        self.preprocess()
        # self.generate_negative()

    def preprocess(self):
        test_mask = self.g.edges[self.target_link].data['test_mask']
        index = th.nonzero(test_mask).squeeze()
        self.test_edge = self.g.find_edges(index, self.target_link)
        self.pos_test_graph = dgl.heterograph({('user', 'user-item', 'item'): self.test_edge},
                                              {ntype: self.g.number_of_nodes(ntype) for ntype in ['user', 'item']})
        self.g.remove_edges(index, self.target_link)
        self.g.remove_edges(index, 'item-user')
        self.neg_test_graph, _ = dgl.load_graphs('./openhgnn/debug/neg.bin')
        self.neg_test_graph = self.neg_test_graph[0]
        return

    def generate_negative(self):
        k = 99
        e = self.pos_test_graph.edges()
        neg_src = []
        neg_dst = []
        for i in range(self.pos_test_graph.number_of_edges()):
            src = e[0][i]
            exp = self.pos_test_graph.successors(src)
            dst = th.randint(high=self.g.number_of_nodes('item'), size=(k,))
            for d in range(len(dst)):
                while dst[d] in exp:
                    dst[d] = th.randint(high=self.g.number_of_nodes('item'), size=(1,))
            src = src.repeat_interleave(k)
            neg_src.append(src)
            neg_dst.append(dst)
        neg_edge = (th.cat(neg_src), th.cat(neg_dst))
        neg_graph = dgl.heterograph({('user', 'user-item', 'item'): neg_edge},
                                    {ntype: self.g.number_of_nodes(ntype) for ntype in ['user', 'item']})
        dgl.save_graphs('./openhgnn/debug/neg.bin', neg_graph)


@register_dataset('hin_link_prediction')
class HIN_LinkPrediction(LinkPredictionDataset):

    def __init__(self, dataset_name, *args, **kwargs):
        super(HIN_LinkPrediction, self).__init__(*args, **kwargs)
        self.g = self.load_HIN(dataset_name)

    def load_link_pred(self, path):
        u_list = []
        v_list = []
        label_list = []
        with open(path) as f:
            for i in f.readlines():
                u, v, label = i.strip().split(', ')
                u_list.append(int(u))
                v_list.append(int(v))
                label_list.append(int(label))
        return u_list, v_list, label_list

    def load_HIN(self, dataset_name):
        self.dataset_name = dataset_name
        if dataset_name == 'academic4HetGNN':
            # used in HetGNN
            dataset = AcademicDataset(name='academic4HetGNN', raw_dir='')
            g = dataset[0].long()
            self.train_batch = self.load_link_pred('./openhgnn/dataset/' + dataset_name + '/a_a_list_train.txt')
            self.test_batch = self.load_link_pred('./openhgnn/dataset/' + dataset_name + '/a_a_list_test.txt')
            self.category = 'author'
        elif dataset_name == 'Book-Crossing':
            g, _ = dgl.load_graphs('./openhgnn/dataset/book_graph.bin')
            g = g[0]
            self.target_link = [('user', 'user-item', 'item')]
            self.node_type = ['user', 'item']
        elif dataset_name == 'amazon4SLICE':
            dataset = AcademicDataset(name='amazon4SLICE', raw_dir='')
            g = dataset[0].long()
        elif dataset_name == 'MTWM':
            dataset = AcademicDataset(name='MTWM', raw_dir='')
            g = dataset[0].long()
            g = add_reverse_edges(g)
            self.target_link = [('user', 'user-buy-spu', 'spu')]
            self.target_link_r = [('spu', 'user-buy-spu-rev', 'user')]
            self.meta_paths_dict = {
                'UPU1': [('user', 'user-buy-poi', 'poi'), ('poi', 'user-buy-poi-rev', 'user')],
                'UPU2': [('user', 'user-click-poi', 'poi'), ('poi', 'user-click-poi-rev', 'user')],
                'USU': [('user', 'user-buy-spu', 'spu'), ('spu', 'user-buy-spu-rev', 'user')],
                'UPSPU1': [('user', 'user-buy-poi', 'poi'), ('poi', 'poi-contain-spu', 'spu'),
                           ('spu', 'poi-contain-spu-rev', 'poi'), ('poi', 'user-buy-poi-rev', 'user')],
                'UPSPU2': [('user', 'user-click-poi', 'poi'), ('poi', 'poi-contain-spu', 'spu'),
                           ('spu', 'poi-contain-spu-rev', 'poi'), ('poi', 'user-click-poi-rev', 'user')]
            }
            self.node_type = ['user', 'spu']
        elif dataset_name == 'HGBl-ACM':
            dataset = HGBDataset(name='HGBn-ACM', raw_dir='')
            g = dataset[0].long()
            self.has_feature = True
            self.target_link = [('paper', 'paper-ref-paper', 'paper')]
            self.node_type = ['author', 'paper', 'subject', 'term']
            self.target_link_r = [('paper', 'paper-cite-paper', 'paper')]
            self.meta_paths_dict = {
                'PAP': [('paper', 'paper-author', 'author'), ('author', 'author-paper', 'paper')],
                'PSP': [('paper', 'paper-subject', 'subject'), ('subject', 'subject-paper', 'paper')],
                'PcPAP': [('paper', 'paper-cite-paper', 'paper'), ('paper', 'paper-author', 'author'),
                          ('author', 'author-paper', 'paper')],
                'PcPSP': [('paper', 'paper-cite-paper', 'paper'), ('paper', 'paper-subject', 'subject'),
                          ('subject', 'subject-paper', 'paper')],
                'PrPAP': [('paper', 'paper-ref-paper', 'paper'), ('paper', 'paper-author', 'author'),
                          ('author', 'author-paper', 'paper')],
                'PrPSP': [('paper', 'paper-ref-paper', 'paper'), ('paper', 'paper-subject', 'subject'),
                          ('subject', 'subject-paper', 'paper')]
            }
        elif dataset_name == 'HGBl-DBLP':
            dataset = HGBDataset(name='HGBn-DBLP', raw_dir='')
            g = dataset[0].long()
            self.has_feature = True
            self.target_link = [('author', 'author-paper', 'paper')]
            self.node_type = ['author', 'paper', 'venue', 'term']
            self.target_link_r = [('paper', 'paper-author', 'author')]
            self.meta_paths_dict = {
                'APA': [('author', 'author-paper', 'paper'), ('paper', 'paper-author', 'author')],
                'APTPA': [('author', 'author-paper', 'paper'), ('paper', 'paper-term', 'term'),
                          ('term', 'term-paper', 'paper'), ('paper', 'paper-author', 'author')],
                'APVPA': [('author', 'author-paper', 'paper'), ('paper', 'paper-venue', 'venue'),
                          ('venue', 'venue-paper', 'paper'), ('paper', 'paper-author', 'author')],
                'PAP': [('paper', 'paper-author', 'author'), ('author', 'author-paper', 'paper')],
                'PTP': [('paper', 'paper-term', 'term'), ('term', 'term-paper', 'paper')],
                'PVP': [('paper', 'paper-venue', 'venue'), ('venue', 'venue-paper', 'paper')],
            }
        elif dataset_name == 'HGBl-IMDB':
            dataset = HGBDataset(name='HGBn-IMDB', raw_dir='')
            g = dataset[0].long()
            self.has_feature = True
            self.target_link = [('actor', 'actor->movie', 'movie')]
            self.node_type = ['actor', 'director', 'keyword', 'movie']
            self.target_link_r = [('movie', 'movie->actor', 'actor')]
            self.meta_paths_dict = {
                'MAM': [('movie', 'movie->actor', 'actor'), ('actor', 'actor->movie', 'movie')],
                'MDM': [('movie', 'movie->director', 'director'), ('director', 'director->movie', 'movie')],
                'MKM': [('movie', 'movie->keyword', 'keyword'), ('keyword', 'keyword->movie', 'movie')],
                'AMA': [('actor', 'actor->movie', 'movie'), ('movie', 'movie->actor', 'actor')],
                'AMDMA': [('actor', 'actor->movie', 'movie'), ('movie', 'movie->director', 'director'),
                          ('director', 'director->movie', 'movie'), ('movie', 'movie->actor', 'actor')]
            }
        return g

    def get_split(self, val_ratio=0.1, test_ratio=0.2):
        if self.dataset_name == 'academic4HetGNN':
            return None, None, None, None, None
        else:
            return super(HIN_LinkPrediction, self).get_split(val_ratio, test_ratio)
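
# Sketch (not part of the original source): each meta_paths_dict entry above
# is a plain list of canonical edge types, so a single meta-path can be
# materialized into a homogeneous reachability graph directly with DGL.
# `_demo_metapath_graph` is a hypothetical helper and 'USU' is just one key
# from the MTWM branch; any key of dataset.meta_paths_dict works.
def _demo_metapath_graph(dataset, mp_key='USU'):
    metapath = dataset.meta_paths_dict[mp_key]
    return dgl.metapath_reachable_graph(dataset.g, metapath)
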
@register_dataset('HGBl_link_prediction')
class HGB_LinkPrediction(LinkPredictionDataset):
    r"""
    The HGB dataset will be used in the task *link prediction*.

    Dataset Name :
    HGBl-amazon / HGBl-LastFM / HGBl-PubMed

    If you want more information, refer to `HGB datasets <https://github.com/THUDM/HGB>`_.

    Attributes
    ----------
    has_feature : bool
        Whether the dataset has features. Except for HGBl-LastFM, all datasets have features.
    target_link : list of tuple[canonical_etypes]
        The etypes of the test links. HGBl-amazon has two etypes of test links, the others have only one.
    """

    def __init__(self, dataset_name, *args, **kwargs):
        super(HGB_LinkPrediction, self).__init__(*args, **kwargs)
        self.dataset_name = dataset_name
        self.target_link_r = None
        if dataset_name == 'HGBl-amazon':
            dataset = HGBDataset(name=dataset_name, raw_dir='')
            g = dataset[0].long()
            self.has_feature = False
            self.target_link = [('product', 'product-product-0', 'product'),
                                ('product', 'product-product-1', 'product')]
            self.target_link_r = None
            self.link = [0, 1]
            self.node_type = ['product']
            self.test_edge_type = {'product-product-0': 0, 'product-product-1': 1}
            self.meta_paths_dict = {
                'P0P': [('product', 'product-product-0', 'product'), ('product', 'product-product-1', 'product')],
                'P1P': [('product', 'product-product-1', 'product'), ('product', 'product-product-0', 'product')]
            }
        elif dataset_name == 'HGBl-LastFM':
            dataset = HGBDataset(name=dataset_name, raw_dir='')
            g = dataset[0].long()
            self.has_feature = False
            self.target_link = [('user', 'user-artist', 'artist')]
            self.node_type = ['user', 'artist', 'tag']
            self.test_edge_type = {'user-artist': 0}
            g = add_reverse_edges(g)
            self.target_link_r = [('artist', 'user-artist-rev', 'user')]
            self.meta_paths_dict = {
                'UU': [('user', 'user-user', 'user')],
                'UAU': [('user', 'user-artist', 'artist'), ('artist', 'user-artist-rev', 'user')],
                'UATAU': [('user', 'user-artist', 'artist'), ('artist', 'artist-tag', 'tag'),
                          ('tag', 'artist-tag-rev', 'artist'), ('artist', 'user-artist-rev', 'user')],
                'AUA': [('artist', 'user-artist-rev', 'user'), ('user', 'user-artist', 'artist')],
                'ATA': [('artist', 'artist-tag', 'tag'), ('tag', 'artist-tag-rev', 'artist')]
            }
        elif dataset_name == 'HGBl-PubMed':
            dataset = HGBDataset(name=dataset_name, raw_dir='')
            g = dataset[0].long()
            self.has_feature = True
            self.target_link = [('1', '1_to_1', '1')]
            self.node_type = ['0', '1', '2', '3']
            self.test_edge_type = {'1_to_1': 2}
            g = add_reverse_edges(g)
            self.target_link_r = [('1', '1_to_1-rev', '1')]
            self.meta_paths_dict = {
                '101': [('1', '0_to_1-rev', '0'), ('0', '0_to_1', '1')],
                '111': [('1', '1_to_1', '1'), ('1', '1_to_1-rev', '1')],
                '121': [('1', '2_to_1-rev', '2'), ('2', '2_to_1', '1')],
                '131': [('1', '3_to_1-rev', '3'), ('3', '3_to_1', '1')]
            }
        self.g = g
        self.shift_dict = self.calculate_node_shift()

    def load_link_pred(self, path):
        return

    def calculate_node_shift(self):
        # Offsets that map per-type node ids back to the global HGB id space.
        node_shift_dict = {}
        count = 0
        for ntype in self.node_type:
            node_shift_dict[ntype] = count
            count += self.g.num_nodes(ntype)
        return node_shift_dict

    def get_split(self):
        r"""
        Get graphs for train, valid and test.

        The dataset has no validation mask, so validation edges are sampled randomly from the train edges.
        """
        val_edge_dict = {}
        test_edge_dict = {}
        out_ntypes = []
        train_graph = self.g
        val_ratio = 0.1
        for i, etype in enumerate(self.target_link):
            train_mask = self.g.edges[etype].data['train_mask'].squeeze()
            train_index = th.nonzero(train_mask).squeeze()
            random_int = th.randperm(len(train_index))[:int(len(train_index) * val_ratio)]
            val_index = train_index[random_int]
            val_edge = self.g.find_edges(val_index, etype)

            test_mask = self.g.edges[etype].data['test_mask'].squeeze()
            test_index = th.nonzero(test_mask).squeeze()
            test_edge = self.g.find_edges(test_index, etype)

            val_edge_dict[etype] = val_edge
            test_edge_dict[etype] = test_edge
            out_ntypes.append(etype[0])
            out_ntypes.append(etype[2])
            train_graph = dgl.remove_edges(train_graph, th.cat((val_index, test_index)), etype)
            if self.target_link_r is not None:
                train_graph = dgl.remove_edges(train_graph, th.cat((val_index, test_index)),
                                               self.target_link_r[i])

        self.out_ntypes = set(out_ntypes)
        val_graph = dgl.heterograph(val_edge_dict,
                                    {ntype: self.g.number_of_nodes(ntype) for ntype in set(out_ntypes)})
        test_graph = dgl.heterograph(test_edge_dict,
                                     {ntype: self.g.number_of_nodes(ntype) for ntype in set(out_ntypes)})

        return train_graph, val_graph, test_graph, None, None

    def save_results(self, hg, score, file_path):
        with hg.local_scope():
            src_list = []
            dst_list = []
            edge_type_list = []
            for etype in hg.canonical_etypes:
                edges = hg.edges(etype=etype)
                src_id = edges[0] + self.shift_dict[etype[0]]
                dst_id = edges[1] + self.shift_dict[etype[2]]
                src_list.append(src_id)
                dst_list.append(dst_id)
                edge_type_list.append(th.full((src_id.shape[0],), self.test_edge_type[etype[1]]))
            src_list = th.cat(src_list)
            dst_list = th.cat(dst_list)
            edge_type_list = th.cat(edge_type_list)
            with open(file_path, "w") as f:
                for l, r, edge_type, c in zip(src_list, dst_list, edge_type_list, score):
                    f.write(f"{l}\t{r}\t{edge_type}\t{round(float(c), 4)}\n")


@register_dataset('ohgb_link_prediction')
class OHGB_LinkPrediction(LinkPredictionDataset):

    def __init__(self, dataset_name, *args, **kwargs):
        super(OHGB_LinkPrediction, self).__init__(*args, **kwargs)
        self.dataset_name = dataset_name
        self.has_feature = True
        if dataset_name == 'ohgbl-MTWM':
            dataset = OHGBDataset(name=dataset_name, raw_dir='')
            g = dataset[0].long()
            self.target_link = [('user', 'user-buy-spu', 'spu')]
            self.target_link_r = [('spu', 'user-buy-spu-rev', 'user')]
            self.node_type = ['user', 'spu']
        elif dataset_name == 'ohgbl-yelp1':
            dataset = OHGBDataset(name=dataset_name, raw_dir='')
            g = dataset[0].long()
            self.target_link = [('user', 'user-buy-business', 'business')]
            self.target_link_r = [('business', 'user-buy-business-rev', 'user')]
        elif dataset_name == 'ohgbl-yelp2':
            dataset = OHGBDataset(name=dataset_name, raw_dir='')
            g = dataset[0].long()
            self.target_link = [('business', 'described-with', 'phrase')]
            self.target_link_r = [('business', 'described-with-rev', 'phrase')]
        elif dataset_name == 'ohgbl-Freebase':
            dataset = OHGBDataset(name=dataset_name, raw_dir='')
            g = dataset[0].long()
            self.target_link = [('BOOK', 'BOOK-and-BOOK', 'BOOK')]
            self.target_link_r = [('BOOK', 'BOOK-and-BOOK-rev', 'BOOK')]
        self.g = g
def build_graph_from_triplets(num_nodes, num_rels, triplets):
    """Create a DGL graph.

    The graph is bidirectional because the RGCN authors use reversed relations.
    This function also generates edge types and a normalization factor
    (reciprocal of node in-degree).
    """
    g = dgl.graph(([], []))
    g.add_nodes(num_nodes)
    src, rel, dst = triplets
    src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
    rel = np.concatenate((rel, rel + num_rels))
    edges = sorted(zip(dst, src, rel))
    dst, src, rel = np.array(edges).transpose()
    g.add_edges(src, dst)
    norm = comp_deg_norm(g)
    print("# nodes: {}, # edges: {}".format(num_nodes, len(src)))
    # Keep the norm as float: an integer cast would truncate 1/deg to zero.
    return g, rel.astype('int64'), norm.astype('float32')


def comp_deg_norm(g):
    g = g.local_var()
    in_deg = g.in_degrees(range(g.number_of_nodes())).float().numpy()
    norm = 1.0 / in_deg
    norm[np.isinf(norm)] = 0
    return norm
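
# Sketch (not part of the original source): exercising the helpers above on a
# toy triple set; all ids and sizes are arbitrary.
def _demo_build_graph():
    triplets = (np.array([0, 1, 2]),   # subjects
                np.array([0, 1, 0]),   # relations
                np.array([1, 2, 3]))   # objects
    # 4 nodes, 2 relations; the result has 6 edges because every triple is
    # also added in reverse with a relation id shifted by num_rels.
    g, rel, norm = build_graph_from_triplets(4, 2, triplets)
    return g, rel, norm
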
@register_dataset('kg_link_prediction')
class KG_LinkPrediction(LinkPredictionDataset):
    """
    From `RGCN <https://arxiv.org/abs/1703.06103>`_: WN18 & FB15k face a data leakage problem.
    """

    def __init__(self, dataset_name, *args, **kwargs):
        super(KG_LinkPrediction, self).__init__(*args, **kwargs)
        if dataset_name in ['wn18', 'FB15k', 'FB15k-237']:
            dataset = load_data(dataset_name)
            g = dataset[0]
            self.num_rels = dataset.num_rels
            self.num_nodes = dataset.num_nodes
            self.train_hg, self.train_triplets = self._build_hg(g, 'train')
            self.valid_hg, self.valid_triplets = self._build_hg(g, 'valid')
            self.test_hg, self.test_triplets = self._build_hg(g, 'test')
            self.g = self.train_hg
            self.category = '_N'
            self.target_link = self.test_hg.canonical_etypes

    def _build_hg(self, g, mode):
        sub_g = dgl.edge_subgraph(g, g.edata[mode + '_edge_mask'], relabel_nodes=False)
        src, dst = sub_g.edges()
        etype = sub_g.edata['etype']
        edge_dict = {}
        for i in range(self.num_rels):
            mask = (etype == i)
            edge_name = ('_N', str(i), '_N')
            edge_dict[edge_name] = (src[mask], dst[mask])
        hg = dgl.heterograph(edge_dict, {'_N': self.num_nodes})
        return hg, th.stack((src, etype, dst)).T

    def modify_size(self, eval_percent, dataset_type):
        if dataset_type == 'valid':
            self.valid_triplets = th.tensor(random.sample(self.valid_triplets.tolist(),
                                                          math.ceil(self.valid_triplets.shape[0] * eval_percent)))
        elif dataset_type == 'test':
            self.test_triplets = th.tensor(random.sample(self.test_triplets.tolist(),
                                                         math.ceil(self.test_triplets.shape[0] * eval_percent)))

    def get_graph_directed_from_triples(self, triples, format='graph'):
        s = th.LongTensor(triples[:, 0])
        r = th.LongTensor(triples[:, 1])
        o = th.LongTensor(triples[:, 2])
        if format == 'graph':
            edge_dict = {}
            for i in range(self.num_rels):
                mask = (r == i)
                edge_name = (self.category, str(i), self.category)
                edge_dict[edge_name] = (s[mask], o[mask])
            return dgl.heterograph(edge_dict, {self.category: self.num_nodes})

    def get_triples(self, g, mask_mode):
        """
        :param g:
        :param mask_mode: should be one of 'train_mask', 'valid_mask', 'test_mask'
        :return:
        """
        edges = g.edges()
        etype = g.edata['etype']
        mask = g.edata.pop(mask_mode)
        return th.stack((edges[0][mask], etype[mask], edges[1][mask]))

    def get_all_triplets(self, dataset):
        train_data = th.LongTensor(dataset.train)
        valid_data = th.LongTensor(dataset.valid)
        test_data = th.LongTensor(dataset.test)
        return train_data, valid_data, test_data

    def get_split(self):
        return self.train_hg, self.valid_hg, self.test_hg, None, None

    def split_graph(self, g, mode='train'):
        """
        Parameters
        ----------
        g : DGLGraph
            A graph in homogeneous format.
        mode : str
            Split the subgraph according to the mode.

        Returns
        -------
        hg : DGLHeteroGraph
        """
        edges = g.edges()
        etype = g.edata['etype']
        if mode == 'train':
            mask = g.edata['train_mask']
        elif mode == 'valid':
            mask = g.edata['valid_edge_mask']
        elif mode == 'test':
            mask = g.edata['test_edge_mask']
        hg = self.build_graph((edges[0][mask], edges[1][mask]), etype[mask])
        return hg

    def build_graph(self, edges, etype):
        edge_dict = {}
        for i in range(self.num_rels):
            mask = (etype == i)
            edge_name = (self.category, str(i), self.category)
            edge_dict[edge_name] = (edges[0][mask], edges[1][mask])
        hg = dgl.heterograph(edge_dict, {self.category: self.num_nodes})
        return hg

    def build_g(self, train):
        s = train[:, 0]
        r = train[:, 1]
        o = train[:, 2]
        edge_dict = {}
        for i in range(self.num_rels):
            mask = (r == i)
            edge_name = (self.category, str(i), self.category)
            edge_dict[edge_name] = (th.LongTensor(s[mask]), th.LongTensor(o[mask]))
        hg = dgl.heterograph(edge_dict, {self.category: self.num_nodes})
        return hg


class kg_sampler():

    def __init__(self):
        self.sampler = 'uniform'

    def generate_sampled_graph_and_labels(self, triplets, sample_size, split_size,
                                          num_rels, adj_list, degrees,
                                          negative_rate, sampler="uniform"):
        """Get the training graph and signals.

        First perform edge neighborhood sampling on the graph, then perform
        negative sampling to generate negative samples.
        """
        # perform edge neighbor sampling
        if self.sampler == "uniform":
            edges = sample_edge_uniform(adj_list, degrees, len(triplets), sample_size)
        elif self.sampler == "neighbor":
            edges = sample_edge_neighborhood(adj_list, degrees, len(triplets), sample_size)
        else:
            raise ValueError("Sampler type must be either 'uniform' or 'neighbor'.")

        # relabel nodes to have consecutive node ids
        edges = triplets[edges]
        src, rel, dst = edges.transpose()
        uniq_v, edges = np.unique((src, dst), return_inverse=True)
        src, dst = np.reshape(edges, (2, -1))
        relabeled_edges = np.stack((src, rel, dst)).transpose()

        # negative sampling
        samples, labels = negative_sampling(relabeled_edges, len(uniq_v), negative_rate)

        # further split the graph: only half of the edges will be used as graph
        # structure, while the other half is used as unseen positive samples
        split_size = int(sample_size * split_size)
        graph_split_ids = np.random.choice(np.arange(sample_size),
                                           size=split_size, replace=False)
        src = src[graph_split_ids]
        dst = dst[graph_split_ids]
        rel = rel[graph_split_ids]

        # build the DGL graph
        print("# sampled nodes: {}".format(len(uniq_v)))
        print("# sampled edges: {}".format(len(src) * 2))
        g, rel, norm = build_graph_from_triplets(len(uniq_v), num_rels, (src, rel, dst))
        return g, uniq_v, rel, norm, samples, labels
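
# The negative_sampling helper called above is defined elsewhere in the
# package and not shown in this section. A sketch consistent with its call
# site and with the reference RGCN implementation: corrupt either the subject
# or the object of each positive triple uniformly at random.
def negative_sampling(pos_samples, num_entity, negative_rate):
    size_of_batch = len(pos_samples)
    num_to_generate = size_of_batch * negative_rate
    neg_samples = np.tile(pos_samples, (negative_rate, 1))
    labels = np.zeros(size_of_batch * (negative_rate + 1), dtype=np.float32)
    labels[:size_of_batch] = 1                    # positives first, then negatives
    values = np.random.randint(num_entity, size=num_to_generate)
    choices = np.random.uniform(size=num_to_generate)
    subj = choices > 0.5
    obj = choices <= 0.5
    neg_samples[subj, 0] = values[subj]           # corrupt subjects
    neg_samples[obj, 2] = values[obj]             # corrupt objects
    return np.concatenate((pos_samples, neg_samples)), labels
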
""" edges = np.zeros((sample_size), dtype=np.int32) # initialize sample_counts = np.array([d for d in degrees]) picked = np.array([False for _ in range(n_triplets)]) seen = np.array([False for _ in degrees]) for i in range(0, sample_size): weights = sample_counts * seen if np.sum(weights) == 0: weights = np.ones_like(weights) weights[np.where(sample_counts == 0)] = 0 probabilities = (weights) / np.sum(weights) chosen_vertex = np.random.choice(np.arange(degrees.shape[0]), p=probabilities) chosen_adj_list = adj_list[chosen_vertex] seen[chosen_vertex] = True chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0])) chosen_edge = chosen_adj_list[chosen_edge] edge_number = chosen_edge[0] while picked[edge_number]: chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0])) chosen_edge = chosen_adj_list[chosen_edge] edge_number = chosen_edge[0] edges[i] = edge_number other_vertex = chosen_edge[1] picked[edge_number] = True sample_counts[chosen_vertex] -= 1 sample_counts[other_vertex] -= 1 seen[other_vertex] = True return edges def sample_edge_uniform(adj_list, degrees, n_triplets, sample_size): """Sample edges uniformly from all the edges.""" all_edges = np.arange(n_triplets) return np.random.choice(all_edges, sample_size, replace=False)