From 183ef077b4e511f24fb069f7e2ce78c6f367d58c Mon Sep 17 00:00:00 2001
From: Xinyu Fu
Date: Thu, 21 Jul 2022 15:51:28 +0800
Subject: [PATCH] add support for PubMed

---
 configs/HGT.json        |  5 +++
 configs/MECCH.json      |  8 ++++
 configs/RGCN.json       |  4 ++
 main.py                 | 83 ++++++++++++-----------------------
 model/MECCH.py          | 31 +++++++++++----
 model/baselines/HAN.py  |  8 ++++
 model/baselines/HGT.py  | 34 +++++++++++++++++
 model/baselines/RGCN.py | 32 ++++++++++++++++
 utils.py                | 66 ++++++++++++++++++++++++++++++++
 9 files changed, 204 insertions(+), 67 deletions(-)

diff --git a/configs/HGT.json b/configs/HGT.json
index e30200b..b817c4f 100644
--- a/configs/HGT.json
+++ b/configs/HGT.json
@@ -18,5 +18,10 @@
     "lastfm": {
         "n_layers": 1,
         "lr": 0.01
+    },
+    "pubmed": {
+        "n_layers": 3,
+        "lr": 0.01,
+        "weight_decay": 0.0
     }
 }
diff --git a/configs/MECCH.json b/configs/MECCH.json
index e0694d1..2352858 100644
--- a/configs/MECCH.json
+++ b/configs/MECCH.json
@@ -30,5 +30,13 @@
         "weight_decay": 0.0,
         "batch_size": 102400,
         "exclude": false
+    },
+    "pubmed": {
+        "max_mp_length": 1,
+        "n_layers": 3,
+        "lr": 0.02,
+        "weight_decay": 0.0,
+        "batch_size": 102400,
+        "exclude": false
     }
 }
diff --git a/configs/RGCN.json b/configs/RGCN.json
index 96f37c6..af153af 100644
--- a/configs/RGCN.json
+++ b/configs/RGCN.json
@@ -17,5 +17,9 @@
         "n_layers": 2,
         "lr": 0.02,
         "dropout": 0.0
+    },
+    "pubmed": {
+        "n_layers": 2,
+        "dropout": 0.0
     }
 }
diff --git a/main.py b/main.py
index a1a5e3b..0734f1a 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 import argparse
-import json
+import pickle
+from pathlib import Path
 
 import dgl
 import numpy as np
@@ -12,62 +13,8 @@
 from model.baselines.HGT import HGT
 from model.baselines.HAN import HAN, HAN_lp
 from model.modules import LinkPrediction_minibatch, LinkPrediction_fullbatch
-from utils import metapath2str, add_metapath_connection, get_all_metapaths, load_data_nc, load_data_lp, \
-    metapath_dict2list, select_metapaths, get_save_path
-
-
-def load_base_config(path='./configs/base.json'):
-    with open(path) as f:
-        config = json.load(f)
-    print('Base configs loaded.')
-    return config
-
-
-def load_model_config(path, dataset):
-    with open(path) as f:
-        config = json.load(f)
-    print('Model configs loaded.')
-    if dataset in config:
-        config_out = config['default']
-        config_out.update(config[dataset])
-        print('{} dataset configs for this model loaded, override defaults.'.format(dataset))
-        return config_out
-    else:
-        print('Model do not have hyperparameter configs for {} dataset, use defaults.'.format(dataset))
-        return config['default']
-
-
-def get_metapath_g(g, args):
-    # Generate the metapath neighbor graphs of all possible metapaths
-    # and integrate them into one dgl.DGLGraph -- metapath_g
-    all_metapaths_dict = get_all_metapaths(g, max_length=args.max_mp_length)
-    all_metapaths_list = metapath_dict2list(all_metapaths_dict)
-    metapath_g = None
-    for mp in all_metapaths_list:
-        metapath_g = add_metapath_connection(g, mp, metapath_g)
-    # copy features and labels
-    metapath_g.ndata["x"] = g.ndata["x"]
-    metapath_g.ndata["y"] = g.ndata["y"]
-    # select only max-length metapath
-    selected_metapaths = select_metapaths(all_metapaths_list, length=args.max_mp_length)
-
-    return metapath_g, selected_metapaths
-
-
-def get_khop_g(g, args):
-    homo_g = dgl.to_homogeneous(g)
-    temp_homo_g = dgl.to_homogeneous(g)
-    homo_g.edata[dgl.ETYPE][:] = 0
-    homo_g.edata[dgl.EID] = th.arange(homo_g.num_edges())
-    for k in range(2, args.max_mp_length + 1):
-        edges = dgl.khop_graph(temp_homo_g, k).edges()
-        etypes = th.full((edges[0].shape[0],), k - 1)
-        eids = th.arange(edges[0].shape[0])
-        homo_g.add_edges(edges[0], edges[1], {dgl.ETYPE: etypes, dgl.EID: eids})
-    hetero_g = dgl.to_heterogeneous(homo_g, g.ntypes, ['{}-hop'.format(i + 1) for i in range(args.max_mp_length)])
-    hetero_g.ndata['x'] = g.ndata['x']
-    hetero_g.ndata['y'] = g.ndata['y']
-    return hetero_g
+from utils import metapath2str, get_metapath_g, get_khop_g, load_data_nc, load_data_lp, \
+    get_save_path, load_base_config, load_model_config
 
 
 def main_nc(args):
@@ -192,6 +139,7 @@ def main_lp(args):
     # load data
     (g_train, g_val, g_test), in_dim_dict, (train_eid_dict, val_eid_dict, test_eid_dict), (
         val_neg_uv, test_neg_uv) = load_data_lp(args.dataset)
+    print("Loaded data from dataset: {}".format(args.dataset))
 
     # check cuda
     use_cuda = args.gpu >= 0 and th.cuda.is_available()
@@ -232,9 +180,22 @@
         test_eid_dict = {metapath2str([g_test.to_canonical_etype(k)]): v for k, v in test_eid_dict.items()}
         target_etype = list(train_eid_dict.keys())[0]
 
-        g_train, _ = get_metapath_g(g_train, args)
-        g_val, _ = get_metapath_g(g_val, args)
-        g_test, selected_metapaths = get_metapath_g(g_test, args)
+        # cache metapath_g
+        load_path = Path('./data') / args.dataset / 'metapath_g-max_mp={}'.format(args.max_mp_length)
+        if load_path.is_dir():
+            g_list, _ = dgl.load_graphs(str(load_path / 'graph.bin'))
+            g_train, g_val, g_test = g_list
+            with open(load_path / 'selected_metapaths.pkl', 'rb') as in_file:
+                selected_metapaths = pickle.load(in_file)
+        else:
+            g_train, _ = get_metapath_g(g_train, args)
+            g_val, _ = get_metapath_g(g_val, args)
+            g_test, selected_metapaths = get_metapath_g(g_test, args)
+            load_path.mkdir()
+            dgl.save_graphs(str(load_path / 'graph.bin'), [g_train, g_val, g_test])
+            with open(load_path / 'selected_metapaths.pkl', 'wb') as out_file:
+                pickle.dump(selected_metapaths, out_file)
+
         n_heads_list = [args.n_heads] * args.n_layers
         model = MECCH(
             g_train,
@@ -290,6 +251,8 @@
             minibatch_flag = False
     elif args.model == 'HAN':
         # assume the target node type has attributes
+        # Note: this HAN version from DGL trains full-batch and builds metapath_reachable_graph on the fly,
+        # so the PubMed dataset must be preprocessed beforehand
         assert args.hidden_dim % args.n_heads == 0
         n_heads_list = [args.n_heads] * args.n_layers
         model_lp = HAN_lp(
diff --git a/model/MECCH.py b/model/MECCH.py
index 55a5271..4f83de5 100644
--- a/model/MECCH.py
+++ b/model/MECCH.py
@@ -141,15 +141,16 @@ def __init__(self, n_metapaths, in_dim, out_dim, fusion_type="conv"):
 
     def forward(self, h_list):
         if self.fusion_type == "mean":
-            return self.linear(th.mean(th.stack(h_list), dim=0))
+            fused = th.mean(th.stack(h_list), dim=0)
         elif self.fusion_type == "weight":
-            return self.linear(th.sum(th.stack(h_list) * self.weight[:, None, None], dim=0))
+            fused = th.sum(th.stack(h_list) * self.weight[:, None, None], dim=0)
         elif self.fusion_type == "conv":
-            return self.linear(th.sum(th.stack(h_list).transpose(0, 1) * self.conv, dim=1))
+            fused = th.sum(th.stack(h_list).transpose(0, 1) * self.conv, dim=1)
         elif self.fusion_type == "cat":
-            return self.linear(th.hstack(h_list))
+            fused = th.hstack(h_list)
         else:
             raise NotImplementedError
+        return self.linear(fused), fused
 
 
 class MECCHLayer(nn.Module):
@@ -210,12 +211,13 @@ def forward(self, block, h_dict):
                 block.dstnodes[ntype].data["h_dst"] = h_dict[ntype][:block.num_dst_nodes(ntype)]
 
         out_h_dict = {}
+        out_embs_dict = {}
         for ntype in block.dsttypes:
             if block.num_dst_nodes(ntype) > 0:
                 metapath_outs = []
                 for metapath_str in self.metapaths_dict[ntype]:
                     metapath_outs.append(self.context_encoders[metapath_str](block, h_dict, metapath_str))
-                out_h_dict[ntype] = self.metapath_fuse[ntype](metapath_outs)
+                out_h_dict[ntype], out_embs_dict[ntype] = self.metapath_fuse[ntype](metapath_outs)
 
         for ntype in out_h_dict:
             if self.residual is not None:
@@ -228,7 +230,7 @@
                 out_h_dict[ntype] = self.activation(out_h_dict[ntype])
             out_h_dict[ntype] = self.dropout(out_h_dict[ntype])
 
-        return out_h_dict
+        return out_h_dict, out_embs_dict
 
 
 class MECCH(nn.Module):
@@ -299,10 +301,25 @@ def forward(self, blocks, x_dict):
         h_dict = h_embed_dict | h_linear_dict
 
         for block, layer in zip(blocks, self.MECCH_layers):
-            h_dict = layer(block, h_dict)
+            h_dict, _ = layer(block, h_dict)
 
         return h_dict
 
+    # used to get node representations for node classification tasks
+    # (i.e., the node vectors just before applying the final linear layer of the last MECCH layer)
+    def get_embs(self, blocks, x_dict):
+        nids_dict = {ntype: nids for ntype, nids in blocks[0].srcdata[dgl.NID].items() if self.in_dim_dict[ntype] < 0}
+
+        # ntype-specific embedding/projection
+        h_embed_dict = self.embed_layer(nids_dict)
+        h_linear_dict = self.linear_layer(x_dict)
+        h_dict = h_embed_dict | h_linear_dict
+
+        for block, layer in zip(blocks, self.MECCH_layers):
+            h_dict, embs_dict = layer(block, h_dict)
+
+        return h_dict, embs_dict
+
 
 class khopMECCHLayer(nn.Module):
     def __init__(
diff --git a/model/baselines/HAN.py b/model/baselines/HAN.py
index d80f2a6..dc71c45 100644
--- a/model/baselines/HAN.py
+++ b/model/baselines/HAN.py
@@ -112,6 +112,14 @@ def forward(self, g, x_dict):
 
         return {self.target_ntype: self.predict(h)}
 
+    def get_embs(self, g, x_dict):
+        h = x_dict[self.target_ntype]
+
+        for gnn in self.layers:
+            h = gnn(g, h)
+
+        return {self.target_ntype: self.predict(h)}, {self.target_ntype: h}
+
 
 class HAN_lp(nn.Module):
     def __init__(self, g, metapaths_u, target_ntype_u, in_size_u, metapaths_v, target_ntype_v, in_size_v, hidden_size,
diff --git a/model/baselines/HGT.py b/model/baselines/HGT.py
index 628aaaf..b3473d2 100644
--- a/model/baselines/HGT.py
+++ b/model/baselines/HGT.py
@@ -211,3 +211,37 @@ def forward(self, G, x_dict):
         for i in range(self.n_layers):
             h_dict = self.gcs[i](G, h_dict)
         return {ntype: self.out(h) for ntype, h in h_dict.items()}
+
+    def get_embs(self, G, x_dict):
+        h_dict = {}
+        if isinstance(G, list):
+            # minibatch
+            nids_dict = {
+                ntype: nids
+                for ntype, nids in G[0].srcdata[dgl.NID].items()
+                if self.in_dim_dict[ntype] < 0
+            }
+            h_embed_dict = self.embed_layer(nids_dict)
+            h_linear_dict = self.linear_layer(x_dict)
+            h_dict = h_embed_dict | h_linear_dict
+            for ntype in h_dict:
+                h_dict[ntype] = F.gelu(h_dict[ntype])
+
+            for layer, block in zip(self.gcs, G):
+                h_dict = layer(block, h_dict)
+        else:
+            # full batch
+            nids_dict = {
+                ntype: G.nodes(ntype)
+                for ntype in G.ntypes
+                if self.in_dim_dict[ntype] < 0
+            }
+            h_embed_dict = self.embed_layer(nids_dict)
+            h_linear_dict = self.linear_layer(x_dict)
+            h_dict = h_embed_dict | h_linear_dict
+            for ntype in h_dict:
+                h_dict[ntype] = F.gelu(h_dict[ntype])
+
+            for i in range(self.n_layers):
+                h_dict = self.gcs[i](G, h_dict)
+        return {ntype: self.out(h) for ntype, h in h_dict.items()}, h_dict
diff --git a/model/baselines/RGCN.py b/model/baselines/RGCN.py
index 90ccc3d..b0f256b 100644
--- a/model/baselines/RGCN.py
+++ b/model/baselines/RGCN.py
@@ -237,3 +237,35 @@ def forward(self, g=None, x_dict=None):
             h_dict = layer(g, h_dict)
 
         return h_dict
+
+    def get_embs(self, g=None, x_dict=None):
+        if isinstance(g, list):
+            # minibatch forward
+            nids_dict = {
+                ntype: nids
+                for ntype, nids in g[0].srcdata[dgl.NID].items()
+                if self.in_dim_dict[ntype] < 0
+            }
+            h_embed_dict = self.embed_layer(nids_dict)
+            h_linear_dict = self.linear_layer(x_dict)
+            h_dict = h_embed_dict | h_linear_dict
+
+            for layer, block in zip(self.layers, g):
+                embs_dict = h_dict  # keep the inputs to the final layer as the node embeddings
+                h_dict = layer(block, h_dict)
+        else:
+            # full graph forward
+            nids_dict = {
+                ntype: g.nodes(ntype)
+                for ntype in g.ntypes
+                if self.in_dim_dict[ntype] < 0
+            }
+            h_embed_dict = self.embed_layer(nids_dict)
+            h_linear_dict = self.linear_layer(x_dict)
+            h_dict = h_embed_dict | h_linear_dict
+
+            for layer in self.layers:
+                embs_dict = h_dict  # keep the inputs to the final layer as the node embeddings
+                h_dict = layer(g, h_dict)
+
+        return h_dict, embs_dict
diff --git a/utils.py b/utils.py
index 80f57b2..645fae0 100644
--- a/utils.py
+++ b/utils.py
@@ -2,12 +2,34 @@
 from collections import defaultdict
 from pathlib import Path
 import shutil
+import json
 
 import dgl
 import numpy as np
 import torch as th
 
 
+def load_base_config(path='./configs/base.json'):
+    with open(path) as f:
+        config = json.load(f)
+    print('Base configs loaded.')
+    return config
+
+
+def load_model_config(path, dataset):
+    with open(path) as f:
+        config = json.load(f)
+    print('Model configs loaded.')
+    if dataset in config:
+        config_out = dict(config['default'])  # copy so the shared defaults are not mutated
+        config_out.update(config[dataset])
+        print('{} dataset configs for this model loaded, overriding defaults.'.format(dataset))
+        return config_out
+    else:
+        print('Model has no hyperparameter configs for the {} dataset, using defaults.'.format(dataset))
+        return config['default']
+
+
 def get_all_metapaths(g, min_length=1, max_length=4):
     etype_dict = {}
     for src, e, dst in g.canonical_etypes:
@@ -76,6 +98,39 @@ def select_metapaths(all_metapaths_list, length=4):
     return dict(selected_metapaths)
 
 
+def get_metapath_g(g, args):
+    # Generate the metapath neighbor graphs of all possible metapaths
+    # and integrate them into one dgl.DGLGraph -- metapath_g
+    all_metapaths_dict = get_all_metapaths(g, max_length=args.max_mp_length)
+    all_metapaths_list = metapath_dict2list(all_metapaths_dict)
+    metapath_g = None
+    for mp in all_metapaths_list:
+        metapath_g = add_metapath_connection(g, mp, metapath_g)
+    # copy features and labels
+    metapath_g.ndata["x"] = g.ndata["x"]
+    metapath_g.ndata["y"] = g.ndata["y"]
+    # select only max-length metapaths
+    selected_metapaths = select_metapaths(all_metapaths_list, length=args.max_mp_length)
+
+    return metapath_g, selected_metapaths
+
+
+def get_khop_g(g, args):
+    homo_g = dgl.to_homogeneous(g)
+    temp_homo_g = dgl.to_homogeneous(g)
+    homo_g.edata[dgl.ETYPE][:] = 0
+    homo_g.edata[dgl.EID] = th.arange(homo_g.num_edges())
+    for k in range(2, args.max_mp_length + 1):
+        edges = dgl.khop_graph(temp_homo_g, k).edges()
+        etypes = th.full((edges[0].shape[0],), k - 1)
+        eids = th.arange(edges[0].shape[0])
+        homo_g.add_edges(edges[0], edges[1], {dgl.ETYPE: etypes, dgl.EID: eids})
+    hetero_g = dgl.to_heterogeneous(homo_g, g.ntypes, ['{}-hop'.format(i + 1) for i in range(args.max_mp_length)])
+    hetero_g.ndata['x'] = g.ndata['x']
+    hetero_g.ndata['y'] = g.ndata['y']
+    return hetero_g
+
+
 def load_data_nc(dataset_name, prefix="./data"):
     if dataset_name == "imdb-gtn":
         # movie*, actor, director
@@ -230,6 +285,17 @@ def load_data_lp(dataset_name, prefix="./data"):
         val_neg_uv = th.tensor(np.load(str(load_path / 'val_neg_user_artist.npy')))
         test_neg_uv = th.tensor(np.load(str(load_path / 'test_neg_user_artist.npy')))
         in_dim_dict = {ntype: -1 for ntype in g_test.ntypes}
+    elif dataset_name == 'pubmed':
+        load_path = Path(prefix, dataset_name)
+        g_list, _ = dgl.load_graphs(str(load_path / 'graph.bin'))
+        g_train, g_val, g_test = g_list
+        train_val_test_idx = np.load(str(load_path / 'train_val_test_idx.npz'))
+        train_eid_dict = {'DISEASE-and-DISEASE': th.tensor(train_val_test_idx['train_idx'])}
+        val_eid_dict = {'DISEASE-and-DISEASE': th.tensor(train_val_test_idx['val_idx'])}
+        test_eid_dict = {'DISEASE-and-DISEASE': th.tensor(train_val_test_idx['test_idx'])}
+        val_neg_uv = th.tensor(np.load(str(load_path / 'val_neg_edges.npy')))
+        test_neg_uv = th.tensor(np.load(str(load_path / 'test_neg_edges.npy')))
+        in_dim_dict = {ntype: g_test.nodes[ntype].data['x'].shape[1] for ntype in g_test.ntypes}
     else:
         raise NotImplementedError
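
Usage note (not part of the patch): the caching branch added to main_lp writes the
prebuilt metapath graphs to ./data/<dataset>/metapath_g-max_mp=<max_mp_length>/.
Below is a minimal sketch of how that cache can be inspected offline; it assumes a
prior run on the pubmed dataset has already populated the cache (max_mp_length is 1
for pubmed per configs/MECCH.json), and the variable names are illustrative only.

    import pickle
    from pathlib import Path

    import dgl

    # Cache layout created by main_lp above: graph.bin holds the train/val/test
    # metapath graphs, selected_metapaths.pkl the ntype -> metapaths mapping.
    cache_dir = Path('./data/pubmed/metapath_g-max_mp=1')

    g_list, _ = dgl.load_graphs(str(cache_dir / 'graph.bin'))
    g_train, g_val, g_test = g_list
    with open(cache_dir / 'selected_metapaths.pkl', 'rb') as in_file:
        selected_metapaths = pickle.load(in_file)

    # Each selected metapath appears as its own edge type in the cached graphs.
    for ntype, metapaths in selected_metapaths.items():
        print(ntype, metapaths)
    print(g_train)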