diff --git a/data/pubmed/README.md b/data/pubmed/README.md new file mode 100644 index 0000000..341d103 --- /dev/null +++ b/data/pubmed/README.md @@ -0,0 +1,7 @@ +## LastFM Dataset + +From [HNE](https://github.com/yangji9181/HNE). + +We further split the original training set into training and validation sets. + +We samples hard negative edges for validtaion and testing by ourselves. diff --git a/preprocess_pubmed.ipynb b/preprocess_pubmed.ipynb new file mode 100644 index 0000000..718c408 --- /dev/null +++ b/preprocess_pubmed.ipynb @@ -0,0 +1,291 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 101, + "id": "caacdd92-ba53-4aa9-bddd-94f034a710bd", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import csv\n", + "import random\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import dgl\n", + "import torch as th\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "bd7349d8-c2cd-4c74-bc76-e69b9817f2ba", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "load_path = Path('data/pubmed/raw')\n", + "df_node = pd.read_csv(load_path / 'node.dat', sep='\\t', names=['node_id', 'node_name', 'node_type', 'node_attributes'], quoting=csv.QUOTE_NONE) # must add this quoting argument\n", + "df_link = pd.read_csv(load_path / 'link.dat', sep='\\t', names=['src_id', 'dst_id', 'link_type', 'link_weight'])\n", + "df_link_test = pd.read_csv(load_path / 'link.dat.test', sep='\\t', names=['src_id', 'dst_id', 'link_status'])\n", + "\n", + "ntypes = ['GENE', 'DISEASE', 'CHEMICAL', 'SPECIES']\n", + "ntype_ids = {'GENE': 0, 'DISEASE': 1, 'CHEMICAL': 2, 'SPECIES': 3}\n", + "etypes = ['GENE-and-GENE',\n", + " 'GENE-causing-DISEASE',\n", + " 'DISEASE-and-DISEASE',\n", + " 'CHEMICAL-in-GENE',\n", + " 'CHEMICAL-in-DISEASE',\n", + " 'CHEMICAL-and-CHEMICAL',\n", + " 'CHEMICAL-in-SPECIES',\n", + " 'SPECIES-with-GENE',\n", + " 'SPECIES-with-DISEASE',\n", + " 'SPECIES-and-SPECIES']" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "56815267-cc18-454f-8c43-003360629e39", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "type_mask = np.zeros(63109, dtype=int)\n", + "type_mask[:] = -1\n", + "type_mask[df_node['node_id']] = df_node['node_type']\n", + "num_nodes_dict = {ntype: (type_mask == ntype_ids[ntype]).sum() for ntype in ntypes}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6d3037ed-c5e2-4352-8e50-95be5c790f52", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "adjM = np.zeros((63109, 63109), dtype=int)\n", + "\n", + "# edges from link.dat\n", + "links = df_link[['src_id', 'dst_id']].to_numpy()\n", + "adjM[links[:, 0], links[:, 1]] = 1\n", + "\n", + "# positive edges from link.dat.test\n", + "links_test = df_link_test[df_link_test['link_status'] == 1].to_numpy()\n", + "adjM[links_test[:, 0], links_test[:, 1]] = 2\n", + "\n", + "# DISEASE-DISEASE matrix\n", + "DD_adjM = adjM[type_mask == ntype_ids['DISEASE']][:, type_mask == ntype_ids['DISEASE']]\n", + "DD_edges = DD_adjM.nonzero()\n", + "train_val_idx = (DD_adjM[DD_edges] == 1).nonzero()[0]\n", + "test_idx = (DD_adjM[DD_edges] == 2).nonzero()[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7c45c341-f994-4f8f-8f7c-957df04daf76", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# sample edges for validation\n", + "train_idx, val_idx = train_test_split(train_val_idx, test_size=0.125, random_state=1024)\n", + "train_idx.sort()\n", + "val_idx.sort()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72984f30-a2dd-490c-851b-77f42a7bf81e", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# g_train\n", + "data_dict_train = {}\n", + "for etype in etypes:\n", + " srctype, _, dsttype = etype.split('-')\n", + " if srctype == dsttype:\n", + " if etype == 'DISEASE-and-DISEASE':\n", + " data_dict_train[('DISEASE', 'DISEASE-and-DISEASE', 'DISEASE')] = (DD_edges[0][train_idx], DD_edges[1][train_idx])\n", + " else:\n", + " data_dict_train[(srctype, etype, srctype)] = adjM[type_mask == ntype_ids[srctype]][:, type_mask == ntype_ids[srctype]].nonzero()\n", + " else:\n", + " data_dict_train[(srctype, etype, dsttype)] = adjM[type_mask == ntype_ids[srctype]][:, type_mask == ntype_ids[dsttype]].nonzero()\n", + " data_dict_train[(dsttype, '^' + etype, srctype)] = adjM[type_mask == ntype_ids[srctype]][:, type_mask == ntype_ids[dsttype]].transpose().nonzero()\n", + "\n", + "# g_val\n", + "data_dict_val = {}\n", + "for etype in etypes:\n", + " srctype, _, dsttype = etype.split('-')\n", + " if srctype == dsttype:\n", + " if etype == 'DISEASE-and-DISEASE':\n", + " data_dict_val[('DISEASE', 'DISEASE-and-DISEASE', 'DISEASE')] = (DD_edges[0][train_val_idx], DD_edges[1][train_val_idx])\n", + " else:\n", + " data_dict_val[(srctype, etype, srctype)] = adjM[type_mask == ntype_ids[srctype]][:, type_mask == ntype_ids[srctype]].nonzero()\n", + " else:\n", + " data_dict_val[(srctype, etype, dsttype)] = adjM[type_mask == ntype_ids[srctype]][:, type_mask == ntype_ids[dsttype]].nonzero()\n", + " data_dict_val[(dsttype, '^' + etype, srctype)] = adjM[type_mask == ntype_ids[srctype]][:, type_mask == ntype_ids[dsttype]].transpose().nonzero()\n", + "\n", + "# g_test\n", + "data_dict_test = {}\n", + "for etype in etypes:\n", + " srctype, _, dsttype = etype.split('-')\n", + " if srctype == dsttype:\n", + " if etype == 'DISEASE-and-DISEASE':\n", + " data_dict_test[('DISEASE', 'DISEASE-and-DISEASE', 'DISEASE')] = DD_edges\n", + " else:\n", + " data_dict_test[(srctype, etype, srctype)] = adjM[type_mask == ntype_ids[srctype]][:, type_mask == ntype_ids[srctype]].nonzero()\n", + " else:\n", + " data_dict_test[(srctype, etype, dsttype)] = adjM[type_mask == ntype_ids[srctype]][:, type_mask == ntype_ids[dsttype]].nonzero()\n", + " data_dict_test[(dsttype, '^' + etype, srctype)] = adjM[type_mask == ntype_ids[srctype]][:, type_mask == ntype_ids[dsttype]].transpose().nonzero()\n", + "\n", + "g_train = dgl.heterograph(data_dict_train, num_nodes_dict, idtype=th.int64)\n", + "g_val = dgl.heterograph(data_dict_val, num_nodes_dict, idtype=th.int64)\n", + "g_test = dgl.heterograph(data_dict_test, num_nodes_dict, idtype=th.int64)\n", + "\n", + "x_dict = {ntype: np.genfromtxt(df_node[df_node['node_type'] == ntype_ids[ntype]]['node_attributes'].tolist(), delimiter=',') for ntype in ntypes}\n", + "for ntype in ntypes:\n", + " temp_tensor = th.from_numpy(x_dict[ntype]).float()\n", + " g_train.nodes[ntype].data['x'] = temp_tensor\n", + " g_val.nodes[ntype].data['x'] = temp_tensor\n", + " g_test.nodes[ntype].data['x'] = temp_tensor\n", + "\n", + "in_dim_dict = {ntype: x_dict[ntype].shape[1] for ntype in ntypes}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f559f20d-c11c-4348-826d-0b959aef737d", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# sample hard negatives for validation\n", + "# for a positive pair (d1, d2), sample (d1, d') as a negative pair\n", + "# where d' associates with d1 via D-D-D metapath (i.e., a 2-hop neighbor) and (d1, d') does not exist in the original graph\n", + "# if such an edge not exist, opt to random edges not in the original graph\n", + "unique, counts = np.unique(DD_edges[0][val_idx], return_counts=True)\n", + "g_val_DDD = dgl.metapath_reachable_graph(g_val, ['DISEASE-and-DISEASE', 'DISEASE-and-DISEASE'])\n", + "val_neg_edges = []\n", + "for d, count in zip(unique, counts):\n", + " neg_diseases = list(set(g_val_DDD.out_edges(d)[1].tolist()) - set(g_val.out_edges(d, etype='DISEASE-and-DISEASE')[1].tolist()))\n", + " if count <= len(neg_diseases):\n", + " neg_ds = random.sample(neg_diseases, k=count)\n", + " else:\n", + " neg_ds = neg_diseases\n", + " to_sample = np.ones(num_nodes_dict['DISEASE'], dtype=int)\n", + " to_sample[g_val.out_edges(d, etype='DISEASE-and-DISEASE')[1].tolist()] = 0\n", + " neg_ds.extend(random.sample(to_sample.nonzero()[0].tolist(), k=count - len(neg_diseases)))\n", + " val_neg_edges.extend([[d, neg_d] for neg_d in neg_ds])\n", + "\n", + "val_neg_edges.sort()\n", + "val_neg_edges = np.array(val_neg_edges)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77493e3a-d4ff-4894-b319-5b3b4443b59e", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# fix testing negative pairs\n", + "# randomly sample edges not in the original graph\n", + "DISEASE_id_map = {global_id: local_id for local_id, global_id in enumerate(df_node[df_node['node_type'] == ntype_ids['DISEASE']]['node_id'])}\n", + "df_link_test_mapped = df_link_test[['src_id', 'dst_id']].replace(DISEASE_id_map)\n", + "df_link_test_mapped_pos = df_link_test_mapped[df_link_test['link_status'] == 1]\n", + "df_link_test_mapped_neg = df_link_test_mapped[df_link_test['link_status'] == 0]\n", + "\n", + "test_pos_nums = df_link_test_mapped_pos.groupby('src_id').size().reindex(list(range(num_nodes_dict['DISEASE'])), fill_value=0).to_numpy()\n", + "test_neg_nums = df_link_test_mapped_neg.groupby('src_id').size().reindex(list(range(num_nodes_dict['DISEASE'])), fill_value=0).to_numpy()\n", + "neg_to_add = test_pos_nums - test_neg_nums\n", + "\n", + "test_neg_edges = df_link_test_mapped_neg.values.tolist()\n", + "for d in neg_to_add.nonzero()[0]:\n", + " to_sample = np.ones(num_nodes_dict['DISEASE'], dtype=int)\n", + " to_sample[g_test.out_edges(d, etype='DISEASE-and-DISEASE')[1].tolist()] = 0\n", + " neg_ds = random.sample(to_sample.nonzero()[0].tolist(), k=neg_to_add[d])\n", + " test_neg_edges.extend([[d, neg_d] for neg_d in neg_ds])\n", + "\n", + "test_neg_edges.sort()\n", + "test_neg_edges = np.array(test_neg_edges)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45377d14-57cb-4731-afc2-568e57dda4f0", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# save everything needed\n", + "save_path = load_path.parent\n", + "dgl.save_graphs(str(save_path / 'graph.bin'), [g_train, g_val, g_test])\n", + "np.savez(save_path / 'train_val_test_idx.npz',\n", + " train_idx=train_idx,\n", + " val_idx=val_idx,\n", + " test_idx=test_idx)\n", + "np.save(save_path / 'val_neg_edges.npy', val_neg_edges)\n", + "np.save(save_path / 'test_neg_edges.npy', test_neg_edges)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dgl-0.7-pytorch-1.10-py39", + "language": "python", + "name": "dgl-0.7-pytorch-1.10-py39" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file