{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# DATA Availability\n", "\n", "Data for these notebooks can be found here: https://github.com/VHRanger/Graph-Data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import gc\n", "import networkx as nx\n", "import numpy as np\n", "import os\n", "import pandas as pd\n", "import time\n", "import scipy\n", "import sklearn\n", "from sklearn import cluster, linear_model\n", "from sklearn.decomposition import TruncatedSVD\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.multiclass import OneVsRestClassifier\n", "import sys\n", "import warnings # Silence perf warning\n", "\n", "sys.path.append(os.path.realpath('..'))\n", "\n", "import nodevectors\n", "import csrgraph as cg\n", "from csrgraph import methods\n", "from nodevectors.evaluation import link_pred\n", "from nodevectors.evaluation import graph_eval\n", "\n", "# From the related karateclub lib (on pip)\n", "# https://github.com/benedekrozemberczki/KarateClub\n", "from karateclub.node_embedding.neighbourhood import GraRep, NodeSketch, Walklets\n", "# UMAP to test (on pip)\n", "import umap\n", "\n", "warnings.simplefilter(\"ignore\")\n", "\n", "def nx_node_weights(G, method, **kwargs):\n", " \"\"\"Node Weights through networkX API\"\"\"\n", " pr = np.zeros(len(G))\n", " prdict = method(G, **kwargs)\n", " for i in G.nodes:\n", " pr[i] = prdict[i]\n", " return pr" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#### CONFIG\n", "N_COMPONENTS = 6 # resulting embedding dim\n", "SEED = 42 # RNG Seed\n", "TEST_SIZE = 0.2\n", "\n", "# For resampling tests\n", "RESAMPLE_WALKS = 30\n", "RESAMPLE_LEN = 5" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#### GRAPHS\n", "#### Uncomment one to choose which graph to run evaluation on\n", "\n", "#### Artificial random graphs\n", "# G = nx.binomial_graph(700, 0.6)\n", "# G, labels = graph_eval.make_cluster_graph(n_nodes=820, n_clusters=18, connections=1000, drop_pct=0.5)\n", "G, labels = graph_eval.make_weighed_cluster_graph(n_nodes=500, n_clusters=6, connections=1500, drop_pct=0.2, max_edge_weight=15)\n", "#### Social graphs\n", "# G, labels = graph_eval.make_blogcatalog(dedupe=True)\n", "# G, mlabels = graph_eval.make_blogcatalog(dedupe=False)\n", "# G, labels = graph_eval.make_email()\n", "# G, labels = graph_eval.get_karateclub(\"facebook\") # twitch, github, facebook, wikipedia\n", "# G = graph_eval.get_from_snap(url=\"http://snap.stanford.edu/data/facebook_combined.txt.gz\", sep=' ', header=None, comment='#')\n", "#### Biology Graphs\n", "# G, mlabels = graph_eval.get_n2v_ppi(\"../data/bioNEV/node2vec_PPI\")\n", "\n", "\n", "#### Needs OutOfBounds Nodes support from CSRGraphs to work\n", "# G = graph_eval.get_drugbank_ddi(\"../data/bioNEV/DrugBank_DDI\")\n", "# G, mlabels = graph_eval.get_mashup_ppi(\"../data/bioNEV/Mashup_PPI\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "clusters: 6\n", "Nodes: 500\n", "Edges: 17668\n", "connected: True\n" ] } ], "source": [ "#### For Link Prediction: Split graph into train and test edge sets\n", "#### (All nodes are still present in both)\n", "G_train, testing_pos_edges = link_pred.split_train_test_graph(G, testing_ratio=TEST_SIZE)\n", "\n", "#### Lazy way to set up evaluation\n", "try:\n", " y = labels.label\n", " n_clusters = y.nunique()\n", " HAS_LABELS = True\n", " print(f\"clusters: {n_clusters}\")\n", "except:\n", " try: # Multilabels \n", " y = MultiLabelBinarizer().fit_transform(mlabels.mlabels)\n", " HAS_LABELS = True\n", " print(f\"multilabels: {y.shape[1]}\")\n", " except: # No Labels\n", " HAS_LABELS = False\n", " print(\"No Labels\")\n", "NNODES = len(G)\n", "print(f\"Nodes: {NNODES}\\nEdges: {len(G.edges)}\\nconnected: {nx.is_connected(G_train)}\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loss: 0.4954\t: 2%|▏ | 101/6000 [00:02<02:51, 34.48it/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Converged! Loss: 0.4951\n", "Time: 2.9698\n", "Link Prediction:\n", "\t(logit) AUC-ROC: 0.516, AUC-PR: 0.507, Acc: 0.511, F1: 0.512\n", "\t(lgbm) AUC-ROC: 0.734, AUC-PR: 0.707, Acc: 0.673, F1: 0.688\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loss: 0.4937\t: 2%|▏ | 101/6000 [00:00<00:16, 352.27it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Converged! Loss: 0.4954\n", "MI: 0.18, RAND 0.30, FM: 0.30\n", "Label Prediction:\n", "\t(logit) Acc: 0.540, F1 micro: 0.540, F1 macro: 0.540\n", "\t(lgbm) Acc: 0.450, F1 micro: 0.450, F1 macro: 0.450\n" ] } ], "source": [ "ggvec_params = dict(\n", " n_components=N_COMPONENTS,\n", " order=1,\n", " tol=0.1,\n", " tol_samples=100,\n", " max_epoch=6_000,\n", " learning_rate=0.1,\n", " negative_ratio=0.05,\n", " exponent=0.33,\n", " verbose=True,\n", ")\n", "\n", "start_t = time.time()\n", "w_train = nodevectors.GGVec(**ggvec_params).fit_transform(G_train)\n", "\n", "print(f\"Time: {time.time() - start_t :.4f}\")\n", "result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)\n", "time.sleep(0.1)\n", "if HAS_LABELS:\n", " w = nodevectors.GGVec(**ggvec_params).fit_transform(G)\n", " graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Making walks... Done, T=1.92\n", "Mapping Walk Names... Done, T=0.10\n", "Training W2V... Done, T=0.32\n", "Time: 2.3881\n", "Link Prediction:\n", "\t(logit) AUC-ROC: 0.552, AUC-PR: 0.482, Acc: 0.539, F1: 0.534\n", "\t(lgbm) AUC-ROC: 0.948, AUC-PR: 0.930, Acc: 0.918, F1: 0.920\n", "Making walks... Done, T=0.01\n", "Mapping Walk Names... Done, T=0.13\n", "Training W2V... Done, T=0.31\n", "MI: 0.93, RAND 0.86, FM: 0.86\n", "Label Prediction:\n", "\t(logit) Acc: 0.940, F1 micro: 0.940, F1 macro: 0.940\n", "\t(lgbm) Acc: 0.950, F1 micro: 0.950, F1 macro: 0.950\n" ] } ], "source": [ "n2v_params = dict(\n", " n_components=N_COMPONENTS,\n", " epochs=5,\n", " walklen=30,\n", " return_weight=1.,\n", " neighbor_weight=1.,\n", " w2vparams={\n", " \"window\":3, \n", " \"negative\":5, \n", " \"iter\":2,\n", " \"batch_words\":128}\n", ")\n", "\n", "start_t = time.time()\n", "w_train = nodevectors.Node2Vec(**n2v_params).fit_transform(G_train)\n", "print(f\"Time: {time.time() - start_t :.4f}\")\n", "result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)\n", "if HAS_LABELS:\n", " w = nodevectors.Node2Vec(**n2v_params).fit_transform(G)\n", " graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Time: 0.0773\n", "Link Prediction:\n", "\t(logit) AUC-ROC: 0.528, AUC-PR: 0.463, Acc: 0.538, F1: 0.540\n", "\t(lgbm) AUC-ROC: 0.951, AUC-PR: 0.940, Acc: 0.928, F1: 0.928\n", "MI: 0.87, RAND 0.82, FM: 0.82\n", "Label Prediction:\n", "\t(logit) Acc: 0.980, F1 micro: 0.980, F1 macro: 0.980\n", "\t(lgbm) Acc: 0.990, F1 micro: 0.990, F1 macro: 0.990\n" ] } ], "source": [ "pne_params = dict(\n", " n_components=N_COMPONENTS,\n", " step=5,\n", " mu=0.2,\n", " theta=0.5,\n", ")\n", "\n", "start_t = time.time()\n", "pne = nodevectors.ProNE(**pne_params)\n", "w_train = pne.fit_transform(G_train)\n", "print(f\"Time: {time.time() - start_t :.4f}\")\n", "result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)\n", "if HAS_LABELS:\n", " pne = nodevectors.ProNE(**pne_params)\n", " w = pne.fit_transform(G)\n", " graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 2/2 [00:00<00:00, 17.00it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Time: 0.2583\n", "Link Prediction:\n", "\t(logit) AUC-ROC: 0.515, AUC-PR: 0.453, Acc: 0.565, F1: 0.599\n", "\t(lgbm) AUC-ROC: 0.957, AUC-PR: 0.939, Acc: 0.941, F1: 0.940\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 2/2 [00:00<00:00, 17.91it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "MI: 1.00, RAND 1.00, FM: 1.00\n", "Label Prediction:\n", "\t(logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000\n", "\t(lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "grarep_params = dict(\n", " n_components=N_COMPONENTS,\n", " order=2,\n", " embedder=TruncatedSVD(\n", " n_iter=10,\n", " random_state=42),\n", " merger=(lambda x : np.sum(x, axis=0)),\n", ")\n", "\n", "start_t = time.time()\n", "w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)\n", "\n", "print(f\"Time: {time.time() - start_t :.4f}\")\n", "result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)\n", "time.sleep(0.1)\n", "if HAS_LABELS:\n", " w = nodevectors.GraRep(**grarep_params).fit_transform(G)\n", " graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loss: 0.0229\t: 4%|▍ | 228/6000 [00:02<01:11, 80.47it/s]\n", "Loss: 0.0241\t: 0%| | 7/6000 [00:00<01:27, 68.87it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Converged! Loss: 0.0225\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loss: 0.0158\t: 4%|▎ | 216/6000 [00:03<01:23, 69.54it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Converged! Loss: 0.0156\n", "Time: 6.0436\n", "Link Prediction:\n", "\t(logit) AUC-ROC: 0.534, AUC-PR: 0.465, Acc: 0.513, F1: 0.513\n", "\t(lgbm) AUC-ROC: 0.953, AUC-PR: 0.939, Acc: 0.931, F1: 0.932\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loss: 0.0229\t: 4%|▎ | 218/6000 [00:02<01:07, 85.74it/s]\n", "Loss: 0.0243\t: 0%| | 7/6000 [00:00<01:32, 64.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Converged! Loss: 0.0229\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loss: 0.0155\t: 4%|▎ | 214/6000 [00:03<01:27, 66.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Converged! Loss: 0.0155\n", "MI: 1.00, RAND 1.00, FM: 1.00\n", "Label Prediction:\n", "\t(logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000\n", "\t(lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000\n" ] } ], "source": [ "##### GraRep + GGVec ####\n", "grarep_params = dict(\n", " n_components=N_COMPONENTS,\n", " order=2,\n", " embedder=nodevectors.GGVec(\n", " n_components=N_COMPONENTS,\n", " tol=0.1,\n", " tol_samples=200,\n", " max_epoch=6_000,\n", " learning_rate=0.02,\n", " negative_ratio=0.6,\n", " exponent=0.33,\n", " verbose=True,\n", " ),\n", " verbose=False,\n", " merger=(lambda x : np.sum(x, axis=0)),\n", ")\n", "\n", "start_t = time.time()\n", "w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)\n", "\n", "print(f\"Time: {time.time() - start_t :.4f}\")\n", "result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)\n", "time.sleep(0.1)\n", "if HAS_LABELS:\n", " w = nodevectors.GraRep(**grarep_params).fit_transform(G)\n", " graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Time: 3.8006\n", "Link Prediction:\n", "\t(logit) AUC-ROC: 0.541, AUC-PR: 0.472, Acc: 0.534, F1: 0.537\n", "\t(lgbm) AUC-ROC: 0.952, AUC-PR: 0.938, Acc: 0.939, F1: 0.939\n", "MI: 1.00, RAND 1.00, FM: 1.00\n", "Label Prediction:\n", "\t(logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000\n", "\t(lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000\n" ] } ], "source": [ "ump_params = dict(\n", " embedder=umap.UMAP,\n", " n_neighbors=3,\n", " min_dist=0.,\n", " metric='cosine',\n", " normalize_graph=True,\n", " n_components=N_COMPONENTS,\n", ")\n", "\n", "start_t = time.time()\n", "w_train = nodevectors.SKLearnEmbedder(**ump_params).fit_transform(G_train)\n", "print(f\"Time: {time.time() - start_t :.4f}\")\n", "result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)\n", "if HAS_LABELS:\n", " w = nodevectors.SKLearnEmbedder(**ump_params).fit_transform(G)\n", " graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 1%|▏ | 76/6000 [00:02<02:54, 33.95it/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Time: 2.9679\n", "Virtual edges: 53851\n", "Link Prediction:\n", "\t(logit) AUC-ROC: 0.535, AUC-PR: 0.472, Acc: 0.527, F1: 0.525\n", "\t(lgbm) AUC-ROC: 0.944, AUC-PR: 0.936, Acc: 0.904, F1: 0.906\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 5%|▌ | 327/6000 [00:01<00:24, 236.32it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "MI: 1.00, RAND 1.00, FM: 1.00\n", "Label Prediction:\n", "\t(logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000\n", "\t(lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000\n" ] } ], "source": [ "### GLoVe with random walks ###\n", "glove_params = dict(\n", " n_components=N_COMPONENTS,\n", " tol=0.0005,\n", " max_epoch=6_000,\n", " learning_rate=0.02, \n", " max_loss=10.,\n", " max_count=50, \n", " exponent=0.5,\n", ")\n", "\n", "start_t = time.time()\n", "wg = cg.csrgraph(G_train).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)\n", "w_train = nodevectors.Glove(**glove_params).fit_transform(wg)\n", "\n", "print(f\"Time: {time.time() - start_t :.4f}\")\n", "print(f\"Virtual edges: {wg.dst.size}\")\n", "result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)\n", "if HAS_LABELS:\n", " wg = cg.csrgraph(G).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)\n", " w = nodevectors.Glove(**glove_params).fit_transform(wg)\n", " graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loss: 0.2861\t: 16%|█▌ | 967/6000 [00:05<00:26, 188.58it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Converged! Loss: 0.2859\n", "Time: 5.6420\n", "Virtual edges: 54151\n", "Link Prediction:\n", "\t(logit) AUC-ROC: 0.534, AUC-PR: 0.485, Acc: 0.527, F1: 0.530\n", "\t(lgbm) AUC-ROC: 0.958, AUC-PR: 0.944, Acc: 0.937, F1: 0.937\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loss: 0.2796\t: 15%|█▌ | 911/6000 [00:03<00:18, 270.13it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Converged! Loss: 0.2795\n", "MI: 1.00, RAND 1.00, FM: 1.00\n", "Label Prediction:\n", "\t(logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000\n", "\t(lgbm) Acc: 0.990, F1 micro: 0.990, F1 macro: 0.990\n" ] } ], "source": [ "### GGVec with random walks ###\n", "ggvec_params = dict(\n", " n_components=N_COMPONENTS,\n", " tol=0.02,\n", " tol_samples=200,\n", " max_epoch=6_000,\n", " learning_rate=0.02,\n", " negative_ratio=0.3,\n", " exponent=0.35,\n", " verbose=True,\n", ")\n", "\n", "start_t = time.time()\n", "wg = cg.csrgraph(G_train).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)\n", "w_train = wg.ggvec(**ggvec_params)\n", "\n", "print(f\"Time: {time.time() - start_t :.4f}\")\n", "print(f\"Virtual edges: {wg.dst.size}\")\n", "result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)\n", "if HAS_LABELS:\n", " wg = cg.csrgraph(G).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)\n", " w = wg.ggvec(**ggvec_params)\n", " graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "###### Slooooowwwwwww ########\n", "# walklets_params = dict(\n", "# walk_number=10, \n", "# walk_length=30, \n", "# dimensions=N_COMPONENTS,\n", "# window_size=4,\n", "# epochs=1, \n", "# learning_rate=0.05\n", "# )\n", "\n", "# try: # Karateclub models don't handle certain graphs\n", "# start_t = time.time()\n", "# model = Walklets(**walklets_params)\n", "# model.fit(G_train)\n", "# print(f\"Time: {time.time() - start_t :.3f}\")\n", "# w_train = model.get_embedding()\n", "# result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)\n", "# if HAS_LABELS:\n", "# model = Walklets(**walklets_params)\n", "# model.fit(G)\n", "# w = model.get_embedding()\n", "# graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)\n", "# except: pass" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### Completely random baseline ###\n", "\n", "w = np.random.randn(len(G), N_COMPONENTS)\n", "\n", "result = link_pred.LinkPrediction(w, G, G_train, testing_pos_edges)\n", "try:\n", " graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)\n", "except: pass" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }