openhgnn.dataset.RecommendationDataset 源代码

import os
import dgl
import torch as th
import numpy as np
from . import BaseDataset, register_dataset
from dgl.data.utils import download,load_graphs,save_graphs,save_info,load_info
from scipy.sparse import csr_matrix
import scipy.sparse as sp
from .multigraph import MultiGraphDataset
from ..sampler.negative_sampler import Uniform_exclusive
from . import AcademicDataset
from .HGCLDataset import HGCLDataset

#add more lib for KGAT
import time
import collections
import zipfile
import numpy as np
import pandas as pd

[文档]@register_dataset('recommendation') class RecommendationDataset(BaseDataset): """ """ def __init__(self,*args, **kwargs): super(RecommendationDataset, self).__init__(*args, **kwargs) self.meta_paths_dict = None
@register_dataset('kgcn_recommendation') class KGCN_Recommendation(RecommendationDataset): r""" Which is used in KGCN. """ def __init__(self, dataset_name, *args, **kwargs): super(RecommendationDataset, self).__init__(*args, **kwargs) dataset = MultiGraphDataset(name=dataset_name, raw_dir='') self.g = dataset[0].long() self.g_1 = dataset[1].long() def get_split(self, validation=True): ratingsGraph = self.g_1 n_edges = ratingsGraph.num_edges() random_int = th.randperm(n_edges) train_idx = random_int[:int(n_edges*0.6)] val_idx = random_int[int(n_edges*0.6):int(n_edges*0.8)] test_idx = random_int[int(n_edges*0.6):int(n_edges*0.8)] return train_idx, val_idx, test_idx def get_train_data(self): pass def get_labels(self): return self.label @register_dataset('hgcl_recommendation') class HGCLRecommendation(RecommendationDataset): def __init__(self, dataset_name, *args, **kwargs): super(RecommendationDataset, self).__init__(*args, **kwargs) dataset = HGCLDataset(name=dataset_name, raw_dir='') self.g = dataset[0].long() def get_split(self, validation=True): ratingsGraph = self.g n_edges = ratingsGraph.num_edges() random_int = th.randperm(n_edges) train_idx = random_int[:int(n_edges * 0.6)] val_idx = random_int[int(n_edges * 0.6):int(n_edges * 0.8)] test_idx = random_int[int(n_edges * 0.6):int(n_edges * 0.8)] return train_idx, val_idx, test_idx def get_train_data(self): pass def get_labels(self): return self.label @register_dataset('lightGCN_recommendation') class lightGCN_Recommendation(RecommendationDataset): def __init__(self, dataset_name, *args, **kwargs): super(RecommendationDataset, self).__init__(*args, **kwargs) if dataset_name not in ['gowalla','yelp2018','amazon-book']: raise KeyError('Dataset {} is not supported!'.format(dataset_name)) self.dataset_name=dataset_name self.data_path=f'openhgnn/dataset/{self.dataset_name}' if not os.path.exists(f"{self.data_path}/train.txt"): self.download() # test self.mode_dict = {'train': 0, "test": 1} self.mode = self.mode_dict['train'] self.n_user = 0 self.m_item = 0 path = './openhgnn/dataset/' + dataset_name train_file = path + '/train.txt' test_file = path + '/test.txt' self.path = path trainUniqueUsers, trainItem, trainUser = [], [], [] testUniqueUsers, testItem, testUser = [], [], [] self.traindataSize = 0 self.testDataSize = 0 with open(train_file) as f: for l in f.readlines(): if len(l) > 0: l = l.strip('\n').split(' ') items = [int(i) for i in l[1:]] uid = int(l[0]) trainUniqueUsers.append(uid) trainUser.extend([uid] * len(items)) trainItem.extend(items) self.m_item = max(self.m_item, max(items)) self.n_user = max(self.n_user, uid) self.traindataSize += len(items) self.trainUniqueUsers = np.array(trainUniqueUsers) self.trainUser = np.array(trainUser) self.trainItem = np.array(trainItem) with open(test_file) as f: for l in f.readlines(): if len(l) > 0: l = l.strip('\n').split(' ') items = [int(i) for i in l[1:]] uid = int(l[0]) testUniqueUsers.append(uid) testUser.extend([uid] * len(items)) testItem.extend(items) self.m_item = max(self.m_item, max(items)) self.n_user = max(self.n_user, uid) self.testDataSize += len(items) self.m_item += 1 self.n_user += 1 self.testUniqueUsers = np.array(testUniqueUsers) self.testUser = np.array(testUser) self.testItem = np.array(testItem) self.Graph = None # (users,items), bipartite graph self.UserItemNet = csr_matrix((np.ones(len(self.trainUser)), (self.trainUser, self.trainItem)), shape=(self.n_user, self.m_item)) self.users_D = np.array(self.UserItemNet.sum(axis=1)).squeeze() self.users_D[self.users_D == 0.] = 1 self.items_D = np.array(self.UserItemNet.sum(axis=0)).squeeze() self.items_D[self.items_D == 0.] = 1. # pre-calculate self.allPos = self.getUserPosItems(list(range(self.n_user))) self.testDict = self.__build_test() self.g = self.getSparseGraph() def get_split(self): return self.g, [], [] def __build_test(self): """ return: dict: {user: [items]} """ test_data = {} for i, item in enumerate(self.testItem): user = self.testUser[i] if test_data.get(user): test_data[user].append(item) else: test_data[user] = [item] return test_data def getUserPosItems(self, users): posItems = [] for user in users: posItems.append(self.UserItemNet[user].nonzero()[1]) return posItems def _convert_sp_mat_to_sp_tensor(self, X): coo = X.tocoo().astype(np.float32) row = th.Tensor(coo.row).long() col = th.Tensor(coo.col).long() index = th.stack([row, col]) data = th.FloatTensor(coo.data) return th.sparse.FloatTensor(index, data, th.Size(coo.shape)) def getSparseGraph(self): print("loading adjacency matrix") if self.Graph is None: try: pre_adj_mat = sp.load_npz(self.path + '/s_pre_adj_mat.npz') print("successfully loaded...") norm_adj = pre_adj_mat except: print("generating adjacency matrix") # s = time() adj_mat = sp.dok_matrix((self.n_user + self.m_item, self.n_user + self.m_item), dtype=np.float32) adj_mat = adj_mat.tolil() R = self.UserItemNet.tolil() adj_mat[:self.n_user, self.n_user:] = R adj_mat[self.n_user:, :self.n_user] = R.T adj_mat = adj_mat.todok() # adj_mat = adj_mat + sp.eye(adj_mat.shape[0]) rowsum = np.array(adj_mat.sum(axis=1)) d_inv = np.power(rowsum, -0.5).flatten() d_inv[np.isinf(d_inv)] = 0. d_mat = sp.diags(d_inv) norm_adj = d_mat.dot(adj_mat) norm_adj = norm_adj.dot(d_mat) norm_adj = norm_adj.tocsr() # end = time() # print(f"costing {end - s}s, saved norm_mat...") sp.save_npz(self.path + '/s_pre_adj_mat.npz', norm_adj) # if self.split == True: # self.Graph = self._split_A_hat(norm_adj) # print("done split matrix") # else: self.Graph = self._convert_sp_mat_to_sp_tensor(norm_adj) # self.Graph = self.Graph.coalesce().to(self.device) self.Graph = self.Graph.coalesce() print("don't split the matrix") return self.Graph def download(self): prefix = 'https://raw.githubusercontent.com/gusye1234/LightGCN-PyTorch/master/data' required_file = ['train.txt', 'test.txt'] for filename in required_file: url = f"{prefix}/{self.dataset_name}/{filename}" file_path = f"{self.data_path}/{filename}" if not os.path.exists(file_path): try: download(url, file_path) except BaseException as e: print("\n",e) print("\nNote! --- If you want to download the file, vpn is required ---") print("If you don't have a vpn, please download the dataset from here: https://github.com/gusye1234/LightGCN-PyTorch") print("\nAfter downloading the dataset, you need to store the files in the following path: ") print(f"{os.getcwd()}\openhgnn\dataset\{self.dataset_name}\\train.txt") print(f"{os.getcwd()}\openhgnn\dataset\{self.dataset_name}\\test.txt") exit() @register_dataset('hin_recommendation') class HINRecommendation(RecommendationDataset): def __init__(self, dataset_name, *args, **kwargs): super(HINRecommendation, self).__init__(*args, **kwargs) self.dataset_name = dataset_name self.num_neg = 20 #self.neg_dir = os.path.join(self.raw_dir, dataset_name, 'neg_{}.bin'.format(self.num_neg)) if dataset_name == 'yelp4rec': dataset = AcademicDataset(name='yelp4rec', raw_dir='') self.g = dataset[0].long() self.target_link = 'user-item' self.target_link_r = 'item-user' self.user_name = 'user' self.item_name = 'item' elif dataset_name == 'yelp4HeGAN': dataset = AcademicDataset(name='yelp4HeGAN', raw_dir='') self.g = dataset[0].long() self.target_link = 'usb' self.target_link_r = 'bus' self.user_name = 'user' self.item_name = 'business' elif dataset_name == 'DoubanMovie': dataset = AcademicDataset(name='DoubanMovie', raw_dir='') self.g = dataset[0].long() self.target_link = 'user-item' self.target_link_r = 'item-user' self.user_name = 'user' self.item_name = 'item' elif dataset_name == 'amazon': data_path = './openhgnn/dataset/amazon_graph.bin' dataset = AcademicDataset(name='amazon', raw_dir='') self.g = dataset[0].long() self.target_link = 'ui' self.target_link_r = 'iu' self.user_name = 'user' self.item_name = 'item' self.out_ntypes = [self.user_name, self.item_name] # self.process() # self.neg_g = self.construct_negative_graph(self.g) def load_HIN(self, dataset_name): g, _ = dgl.load_graphs(dataset_name) return g[0] # def process(self, g): # # sub 1 for every node # new = {} # for etype in g.canonical_etypes: # edges = g.edges(etype=etype) # new[etype] = (edges[0]-1, edges[1]-1) # hg = dgl.heterograph(new) # hg.edata['val_mask'] = g.edata['val_mask'] # hg.edata['test_mask'] = g.edata['test_mask'] # hg.edata['train_mask'] = g.edata['train_mask'] # from dgl.data.utils import save_graphs # save_graphs(f"./openhgnn/dataset/{self.dataset_name}.bin", hg) def get_split(self, validation=True): test_mask = self.g.edges[self.target_link].data['test_mask'].squeeze() test_index = th.nonzero(test_mask).squeeze() test_edge = self.g.find_edges(test_index, self.target_link) test_graph = dgl.heterograph({(self.user_name, self.target_link, self.item_name): test_edge}, {ntype: self.g.number_of_nodes(ntype) for ntype in self.out_ntypes}) if validation: val_mask = self.g.edges[self.target_link].data['val_mask'].squeeze() val_index = th.nonzero(val_mask).squeeze() val_edge = self.g.find_edges(val_index, self.target_link) val_graph = dgl.heterograph({(self.user_name, self.target_link, self.item_name): val_edge}, {ntype: self.g.number_of_nodes(ntype) for ntype in self.out_ntypes}) train_graph = dgl.remove_edges(self.g, th.cat((val_index, test_index)), self.target_link) train_graph = dgl.remove_edges(train_graph, th.cat((val_index, test_index)), self.target_link_r) else: train_graph = dgl.remove_edges(self.g, test_index, self.target_link) train_graph = dgl.remove_edges(train_graph, test_index, self.target_link_r) val_graph = train_graph return train_graph, val_graph, test_graph def construct_negative_graph(self, train_g): fname = f'./openhgnn/dataset/{self.dataset_name}/neg_graph_{self.num_neg}.bin' if os.path.exists(fname): g, _ = load_graphs(fname) return g[0] else: k = self.num_neg negative_sampler = Uniform_exclusive(k) negative_edges = negative_sampler(train_g.to('cpu'), { self.target_link: th.arange(train_g.num_edges(self.target_link))}) # negative_edges = negative_sampler(train_g.to('cpu'), { # self.target_link: th.arange(10)}) neg_g = dgl.heterograph(negative_edges, {ntype: self.g.number_of_nodes(ntype) for ntype in self.out_ntypes}) dgl.save_graphs(fname, neg_g) return neg_g @register_dataset('test_link_prediction') class Test_Recommendation(RecommendationDataset): def __init__(self, dataset_name): super(RecommendationDataset, self).__init__() self.g = self.load_HIN('./openhgnn/debug/data.bin') self.target_link = 'user-item' self.has_feature = False self.preprocess() #self.generate_negative() def load_HIN(self, dataset_name): g, _ = load_graphs(dataset_name) return g[0] def preprocess(self): test_mask = self.g.edges[self.target_link].data['test_mask'] index = th.nonzero(test_mask).squeeze() self.test_edge = self.g.find_edges(index, self.target_link) self.pos_test_graph = dgl.heterograph({('user', 'user-item', 'item'): self.test_edge}, {ntype: self.g.number_of_nodes(ntype) for ntype in ['user', 'item']}) self.g.remove_edges(index, self.target_link) self.g.remove_edges(index, 'item-user') self.neg_test_graph, _ = dgl.load_graphs('./openhgnn/debug/neg.bin') self.neg_test_graph = self.neg_test_graph[0] return negative_sampler = Uniform_exclusive(99) self.negative_g = negative_sampler(self.hg.to('cpu'), {self.target_link: th.arange(self.hg.num_edges(self.target_link))}) def generate_negative(self): k = 99 e = self.pos_test_graph.edges() neg_src = [] neg_dst = [] for i in range(self.pos_test_graph.number_of_edges()): src = e[0][i] exp = self.pos_test_graph.successors(src) dst = th.randint(high=self.g.number_of_nodes('item'), size=(k,)) for d in range(len(dst)): while dst[d] in exp: dst[d] = th.randint(high=self.g.number_of_nodes('item'), size=(1,)) src = src.repeat_interleave(k) neg_src.append(src) neg_dst.append(dst) neg_edge = (th.cat(neg_src), th.cat(neg_dst)) neg_graph = dgl.heterograph({('user', 'user-item', 'item'): neg_edge}, {ntype: self.g.number_of_nodes(ntype) for ntype in ['user', 'item']}) dgl.save_graphs('./openhgnn/debug/neg.bin', neg_graph) @register_dataset('kgat_recommendation') class KGAT_recommendation(RecommendationDataset): def __init__(self,dataset_name,*args, **kwargs): super(KGAT_recommendation, self).__init__(*args, **kwargs) if dataset_name not in ['yelp2018','amazon-book','last-fm']: raise KeyError('Dataset {} is not supported!'.format(dataset_name)) self.dataset_name=dataset_name self.data_path=f'openhgnn/dataset/{self.dataset_name}' if self.dataset_name=='yelp2018': self.processed_data_path = 'openhgnn/dataset/KGAT_yelp2018' elif self.dataset_name=='amazon-book': self.processed_data_path = 'openhgnn/dataset/KGAT_amazon-book' else: self.processed_data_path = 'openhgnn/dataset/KGAT_last-fm' if os.path.exists(f"{self.processed_data_path}/graph.bin"): if os.path.exists(f"{self.processed_data_path}/other_info.pkl"): self.load() else: self.download() self.preprocess() self.save() def download(self): prefix = 'https://raw.githubusercontent.com/xiangwang1223/knowledge_graph_attention_network/master/Data' if self.dataset_name =='last-fm': required_file = ['train.txt', 'test.txt', 'kg_final.txt'] else: required_file = ['train.txt', 'test.txt', 'kg_final.txt.zip'] for filename in required_file: url = f"{prefix}/{self.dataset_name}/{filename}" file_path = f"{self.data_path}/{filename}" if not os.path.exists(file_path): try: download(url,file_path) if filename == 'kg_final.txt.zip': zip_file = zipfile.ZipFile(file_path) zip_file.extractall(f"{self.data_path}") zip_file.close() except BaseException as e: print("\n",e) print("\nNote! --- If you want to download the file, vpn is required ---") print("If you don't have a vpn, please download the dataset from here: https://github.com/xiangwang1223/knowledge_graph_attention_network/") print("\nAfter downloading the dataset, you need to store the files in the following path: ") print(f"{os.getcwd()}\openhgnn\dataset\{self.dataset_name}\\train.txt") print(f"{os.getcwd()}\openhgnn\dataset\{self.dataset_name}\\test.txt") print(f"{os.getcwd()}\openhgnn\dataset\{self.dataset_name}\kg_final.txt") print(f"{os.getcwd()}\openhgnn\dataset\{self.dataset_name}\pretrain\mf.npz") exit() #load pretrain file prefix=("https://raw.githubusercontent.com/xiangwang1223/knowledge_graph_attention_network/master/Model/pretrain") pretrain_url=f"{prefix}/{self.dataset_name}/mf.npz" self.pretrain_embedding_dir=f"{self.data_path}/pretrain/mf.npz" if not os.path.exists(self.pretrain_embedding_dir): download(pretrain_url,self.pretrain_embedding_dir) print(time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()), '--- All files have been downloaded! ---') def preprocess(self): print(time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()), '--- Start Processing! ---') # get data_file_path train_file = f"{self.data_path}/train.txt" test_file = f"{self.data_path}/test.txt" kg_file = f"{self.data_path}/kg_final.txt" self.cf_train_data, self.train_user_dict = self.load_cf(train_file) self.cf_test_data, self.test_user_dict = self.load_cf(test_file) self.statistic_cf() print(time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()), '--- cf data finish ---') kg_data = self.load_kg(kg_file) print(time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()), '--- kg data load ---') self.construct_data(kg_data) self.g = self.create_graph(self.kg_train_data, self.n_users_entities) print(time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()), '--- kg data finish ---') print(time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()), '--- Done! ---') def load_cf(self, filename): ''' Create user-item bipartite graph ''' user = [] item = [] user_dict = dict() lines = open(filename, 'r').readlines() for l in lines: tmp = l.strip() inter = [int(i) for i in tmp.split()] if len(inter) > 1: user_id, item_ids = inter[0], inter[1:] item_ids = list(set(item_ids)) for item_id in item_ids: user.append(user_id) item.append(item_id) user_dict[user_id] = item_ids user = np.array(user, dtype=np.int32) item = np.array(item, dtype=np.int32) return (user, item), user_dict def statistic_cf(self): ''' Some statistic information of the user-item bipartite graph ''' self.n_users = max(max(self.cf_train_data[0]), max(self.cf_test_data[0])) + 1 self.n_items = max(max(self.cf_train_data[1]), max(self.cf_test_data[1])) + 1 self.n_cf_train = len(self.cf_train_data[0]) self.n_cf_test = len(self.cf_test_data[0]) def load_kg(self, filename): ''' Load knowledge graph data ''' kg_data = pd.read_csv(filename, sep=' ', names=['h', 'r', 't'], engine='python') kg_data = kg_data.drop_duplicates() return kg_data def construct_data(self, kg_data): ''' Alignment of knowledge graph and the user-item bipartite graph ''' n_relations = max(kg_data['r']) + 1 reverse_kg_data = kg_data.copy() reverse_kg_data = reverse_kg_data.rename({'h': 't', 't': 'h'}, axis='columns') reverse_kg_data['r'] += n_relations kg_data = pd.concat([kg_data, reverse_kg_data], axis=0, ignore_index=True, sort=False) kg_data['r'] += 2 self.n_relations = max(kg_data['r']) + 1 self.n_entities = max(max(kg_data['h']), max(kg_data['t'])) +1 self.n_users_entities = self.n_users + self.n_entities self.cf_train_data = \ (np.array(list(map(lambda d: d + self.n_entities, self.cf_train_data[0]))).astype(np.int32), self.cf_train_data[1].astype(np.int32)) self.cf_test_data = \ (np.array(list(map(lambda d: d + self.n_entities, self.cf_test_data[0]))).astype(np.int32), self.cf_test_data[1].astype(np.int32)) self.train_user_dict = {k + self.n_entities: np.unique(v).astype(np.int32) for k, v in self.train_user_dict.items()} self.test_user_dict = {k + self.n_entities: np.unique(v).astype(np.int32) for k, v in self.test_user_dict.items()} cf2kg_train_data = pd.DataFrame(np.zeros((self.n_cf_train, 3), dtype=np.int32), columns=['h', 'r', 't']) cf2kg_train_data['h'] = self.cf_train_data[0] cf2kg_train_data['t'] = self.cf_train_data[1] reverse_cf2kg_train_data = pd.DataFrame(np.ones((self.n_cf_train, 3), dtype=np.int32), columns=['h', 'r', 't']) reverse_cf2kg_train_data['h'] = self.cf_train_data[1] reverse_cf2kg_train_data['t'] = self.cf_train_data[0] self.kg_train_data = pd.concat([kg_data, cf2kg_train_data, reverse_cf2kg_train_data], ignore_index=True) self.n_kg_train = len(self.kg_train_data) # construct kg dict self.train_kg_dict = collections.defaultdict(list) for row in self.kg_train_data.iterrows(): h, r, t = row[1] self.train_kg_dict[h].append((t, r)) def create_graph(self, kg_data, n_nodes): ''' Create DGLgraph ''' g = dgl.graph((kg_data['t'], kg_data['h'])) g.ndata['id'] = th.arange(n_nodes, dtype=th.long) g.edata['type'] = th.LongTensor(kg_data['r']) return g def get_split(self): ''' Make it compatible to recommendation task ''' return None,None,None def get_labels(self): ''' Make it compatible to recommendation task ''' return None def save(self): ''' Store all the information for training ''' #save a dictionary print(f'storing graph and other information in {self.processed_data_path}') info_dict={'n_users':self.n_users,'n_entities':self.n_entities,'n_relations':self.n_relations,'n_items':self.n_items, 'train_user_dict':self.train_user_dict,'test_user_dict':self.test_user_dict,'train_kg_dict':self.train_kg_dict, 'n_cf_train':self.n_cf_train,'n_kg_train':self.n_kg_train} os.makedirs(os.path.dirname(f"{self.processed_data_path}/other_info.pkl"),exist_ok=True) save_info(f"{self.processed_data_path}/other_info.pkl",info_dict) save_graphs(f"{self.processed_data_path}/graph.bin",[self.g]) def load(self): ''' Store all the information for training ''' #load information from a dictionary print(f'loading data from {self.processed_data_path}') new_dict=load_info(f"{self.processed_data_path}/other_info.pkl") self.train_user_dict=new_dict['train_user_dict'] self.test_user_dict=new_dict['test_user_dict'] self.train_kg_dict = new_dict['train_kg_dict'] self.n_users=new_dict['n_users'] self.n_entities=new_dict['n_entities'] self.n_relations=new_dict['n_relations'] self.n_items=new_dict['n_items'] self.n_cf_train=new_dict['n_cf_train'] self.n_kg_train =new_dict['n_kg_train'] self.g=load_graphs(f"{self.processed_data_path}/graph.bin") self.g=self.g[0][0] self.pretrain_embedding_dir = f"{self.data_path}/pretrain/mf.npz" print(time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()), '--- Start Training! ---')